import struct
import warnings
import numpy as np
import re
import numpy.typing as npt
from typing import Sequence
import os
from enum import IntEnum
import torch
if not torch.xpu.is_available():
    import KTransformersOps
from safetensors import safe_open
from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant
from ktransformers.util.custom_gguf import *
from safetensors.torch import save_file
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, Union


class ModelLoader(ABC):
    """
    Abstract base class for model loaders.
    Defines the interface that all model loaders must implement.
    """
    tensor_file_map = {}

    @abstractmethod
    def has_tensor(self, name: str):
        """
        Check if the tensor exists in the loader.

        Args:
            name: Name of the tensor to check

        Returns:
            bool: True if the tensor exists, False otherwise
        """
        pass


class SafeTensorLoader(ModelLoader):
    tensor_file_map: dict
    tensor_type_map: dict
    file_handle_map: dict
    tensor_device_map: dict

    def __init__(self, file_path: str):
        self.__load_tensor_file_map(file_path)

    def __load_tensor_file_map(self, file_path: str):
        # Normalize the input path: accept either a file or a folder and work on the folder.
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Path not found: {file_path}")
        if os.path.isfile(file_path):
            folder_path = os.path.dirname(file_path)
        else:
            folder_path = file_path

        self.file_handle_map = {}
        self.tensor_file_map = {}
        self.tensor_type_map = {}
        self.tensor_device_map = {}

        found_safetensor = False
        for root, _, files in os.walk(folder_path):
            files = sorted(files)
            for file in files:
                if file.endswith(".safetensors"):
                    found_safetensor = True
                    file_path = os.path.join(root, file)
                    if file not in self.file_handle_map:
                        try:
                            handle = safe_open(file_path, framework="pt")
                            self.file_handle_map[file] = handle
                        except Exception as e:
                            print(f"Error opening Safetensor file {file_path}: {e}")
                            continue

                    f = self.file_handle_map.get(file)
                    if f is None:
                        continue
                    try:
                        for key in f.keys():
                            self.tensor_file_map[key] = file
                    except Exception as e:
                        print(f"Error reading Safetensor file {file_path}: {e}")

        # if not found_safetensor:
        #     raise FileNotFoundError(f"No Safetensor files found in {folder_path}")

    def load_tensor(self, key: str, device: str = "cpu"):
        if translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        elif key in self.tensor_file_map:
            pass
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key)
        return tensor.to(device)
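    # Key-resolution note: load_tensor() first tries the GGUF-translated name and then the
    # verbatim safetensor name, so the same loader serves hybrid checkpoints that store
    # GGUF-style names (e.g. an illustrative "blk.0.ffn_gate_exps.weight") as well as plain
    # HF-style safetensors. The exact mapping comes from translate_name_to_gguf() in
    # ktransformers.util.custom_gguf; the example name here is only illustrative.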
    def load_experts(self, key: str, device: str = "cpu"):
        '''
        Load experts from safetensor
        key: the name of the experts
        device: the device to load the experts to
        return: dict, {up: tensor, down: tensor, gate: tensor, up_type: int, down_type: int, gate_type: int}
        {xxx}_type: the type of the {xxx} tensor, corresponding to the ggml type
        '''
        if self.has_tensor(translate_name_to_gguf(key) + ".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = translate_name_to_gguf(key)
            # Load fused expert tensors and their ggml types from the safetensor
            gate_key = f"{base_key}.ffn_gate_exps.weight"
            gate_type_key = f"{base_key}.ffn_gate_exps.ggml_type"
            up_key = f"{base_key}.ffn_up_exps.weight"
            up_type_key = f"{base_key}.ffn_up_exps.ggml_type"
            down_key = f"{base_key}.ffn_down_exps.weight"
            down_type_key = f"{base_key}.ffn_down_exps.ggml_type"

            gate_tensor = self.load_tensor(gate_key, device).numpy()
            up_tensor = self.load_tensor(up_key, device).numpy()
            down_tensor = self.load_tensor(down_key, device).numpy()
            gate_type = self.load_tensor(gate_type_key, device).item()
            up_type = self.load_tensor(up_type_key, device).item()
            down_type = self.load_tensor(down_type_key, device).item()

            return {
                "up": up_tensor,
                "gate": gate_tensor,
                "down": down_tensor,
                "up_type": up_type,
                "gate_type": gate_type,
                "down_type": down_type
            }
        else:
            # Load experts stored as individual per-expert projections
            base_key = key  # e.g. "model.layers.3.mlp.experts"
            experts_count = 0

            # First, count how many experts we have by checking for each expert's up_proj
            while self.has_tensor(f"{base_key}.{experts_count}.up_proj.weight"):
                experts_count += 1

            if experts_count == 0:
                raise ValueError(f"No experts found for key {base_key}")

            # Initialize empty lists to store tensors for each projection type
            up_projs = []
            gate_projs = []
            down_projs = []

            # Load all expert weights
            for expert_id in range(experts_count):
                up_key = f"{base_key}.{expert_id}.up_proj.weight"
                gate_key = f"{base_key}.{expert_id}.gate_proj.weight"
                down_key = f"{base_key}.{expert_id}.down_proj.weight"

                up_tensor = self.load_tensor(up_key, device)
                gate_tensor = self.load_tensor(gate_key, device)
                down_tensor = self.load_tensor(down_key, device)

                up_projs.append(up_tensor)
                gate_projs.append(gate_tensor)
                down_projs.append(down_tensor)

            # Stack the tensors along a new expert dimension
            up_tensor = torch.stack(up_projs, dim=0)
            gate_tensor = torch.stack(gate_projs, dim=0)
            down_tensor = torch.stack(down_projs, dim=0)

            # Get original dtype for GGML type determination
            orig_up_dtype = up_tensor.dtype
            orig_gate_dtype = gate_tensor.dtype
            orig_down_dtype = down_tensor.dtype

            # Convert to numpy; bfloat16/float16 data is reinterpreted as uint16 since numpy has no bfloat16
            up_numpy = up_tensor.view(torch.uint16).numpy()
            gate_numpy = gate_tensor.view(torch.uint16).numpy()
            down_numpy = down_tensor.view(torch.uint16).numpy()

            # Determine tensor data types for GGML conversion
            def get_ggml_type(dtype):
                if dtype == torch.float32:
                    return GGMLQuantizationType.F32
                elif dtype == torch.float16:
                    return GGMLQuantizationType.F16
                elif dtype == torch.bfloat16:
                    return GGMLQuantizationType.BF16
                else:
                    raise ValueError(f"Unsupported tensor dtype: {dtype}")

            return {
                "up": up_numpy,
                "gate": gate_numpy,
                "down": down_numpy,
                "up_type": get_ggml_type(orig_up_dtype),
                "gate_type": get_ggml_type(orig_gate_dtype),
                "down_type": get_ggml_type(orig_down_dtype)
            }
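    # Return-layout sketch (illustrative, assuming bf16 per-expert weights):
    #   load_experts("model.layers.3.mlp.experts") ->
    #     {"up":   uint16 view of a stack shaped [n_experts, intermediate, hidden],
    #      "gate": uint16 view of a stack shaped [n_experts, intermediate, hidden],
    #      "down": uint16 view of a stack shaped [n_experts, hidden, intermediate],
    #      "up_type"/"gate_type"/"down_type": GGMLQuantizationType values (BF16 here)}
    # In the legacy hybrid branch the arrays are instead the raw fused GGML-quantized
    # expert blobs, and the *_type entries come from the stored ".ggml_type" tensors.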
    def load_gate(self, key: str, device: str = "cpu"):
        '''
        Load gate from safetensor
        key: the name of the gate
        device: the device to load the gate to
        return: dict, {'weight': tensor, 'e_score_correction_bias': tensor}
        '''
        target = ["weight", "e_score_correction_bias"]
        res = {'weight': None, 'e_score_correction_bias': None}
        if self.has_tensor(translate_name_to_gguf(key) + ".ffn_gate_exps.weight"):
            # legacy branch for loading hybrid model
            base_key = key
            for k in target:
                translated_key = translate_name_to_gguf(f"{base_key}.{k}")
                if self.has_tensor(translated_key):
                    tensor = self.load_tensor(translated_key, device)
                    res[k] = tensor
        else:
            # Load gate from safetensor
            base_key = key
            for k in target:
                if self.has_tensor(f"{base_key}.{k}"):
                    tensor = self.load_tensor(f"{base_key}.{k}", device)
                    res[k] = tensor
        return res

    def close_all_handles(self):
        for handle in self.file_handle_map.values():
            handle.close()
        self.file_handle_map.clear()

    def load_dequantized_tensor(self, key: str, device: str = "cpu"):
        if key in self.tensor_file_map:
            pass
        elif translate_name_to_gguf(key) in self.tensor_file_map:
            key = translate_name_to_gguf(key)
        else:
            raise KeyError(f"Key {key} not found in Safetensor files")
        file = self.tensor_file_map[key]
        f = self.file_handle_map.get(file)
        if f is None:
            raise FileNotFoundError(f"File {file} not found in Safetensor files")
        tensor = f.get_tensor(key).to(device)
        if key.endswith(".weight"):
            if key[:-7] + ".weight_scale_inv" in self.tensor_file_map:
                weight_scale_inv = f.get_tensor(key[:-7] + ".weight_scale_inv").to(device)
                tensor = weight_dequant(tensor, weight_scale_inv)
        return tensor.to(device)

    def has_tensor(self, name: str):
        return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map


class GGUFLoader(ModelLoader):
    tensor_info: dict
    gguf_path: str
    tensor_file_map: dict  # {tensor_name: tensor_file_path}
    gguf_file_meta: dict
    safetensor_loader: SafeTensorLoader

    def __init__(self, gguf_path: str):
        # Check that the directory exists
        if not os.path.exists(gguf_path):
            raise FileNotFoundError(f"GGUF dir not found: {gguf_path}")
        if os.path.isfile(gguf_path):
            gguf_path = os.path.dirname(gguf_path)

        self.safetensor_loader = None

        self.tensor_info = {}
        self.gguf_path = gguf_path
        self.tensor_file_map = {}
        self.file_data_map = {}
        self.gguf_file_meta = {}
        self.tensor_device_map = {}

        # Walk through all the .gguf files in the directory
        found_gguf = False
        for root, dirs, files in os.walk(gguf_path):
            for file in files:
                if file.endswith(".gguf"):
                    found_gguf = True
                    file_name = os.path.join(root, file)
                    with open(file_name, "rb") as f:
                        self.load_gguf(f)
                    if file_name not in self.file_data_map:
                        self.file_data_map[file_name] = np.memmap(file_name, mode='r')
        if not found_gguf:
            raise FileNotFoundError(f"Cannot find any .gguf files in: {gguf_path}")

    def load_gguf(self, f):
        f.seek(0)
        assert f.read(4) == b'GGUF'
        # Standard GGUF header after the magic: version (u32), tensor count (u64), kv count (u64)
        values = struct.unpack("<IQQ", f.read(4 + 8 + 8))
        version, n_tensors, n_kv = values
        # ... the parsing of the GGUF key/value metadata and tensor-info records (which fills
        # self.gguf_file_meta, self.tensor_info and self.tensor_file_map), together with the
        # mmap accessor helpers used below, is elided here ...

    def load_expert_tensor(self, name, data, expert_id, elements_per_expert,
                           device="cuda", target_dtype=torch.get_default_dtype()) -> torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        shape = t["shape"]
        ggml_type = t["ggml_type"]
        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")
        ggml_name = GGML_NAMES[ggml_type]

        # TODO: experts may be fused within a quant block; split it
        assert elements_per_expert % GGML_ELEMENTS_PER_BLOCK[ggml_name] == 0, "experts may be fused in quant block, please use CPU dequant"
        blocks_per_experts = elements_per_expert // GGML_ELEMENTS_PER_BLOCK[ggml_name]

        block_size = GGML_BLOCK_SIZES[ggml_name]
        offset = expert_id * block_size * blocks_per_experts
        data = data[offset: offset + block_size * blocks_per_experts]

        if "cuda" in device.lower():
            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
        else:
            values = GGML_DEQUANTIZE[ggml_name](data)
            values = torch.from_numpy(values.copy())

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)
        values = values.view(shape[-2::-1])

        return values
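    # Offset-arithmetic sketch (illustrative numbers, assuming the standard Q4_K layout of
    # 256 elements per 144-byte block): an expert with elements_per_expert = 1,048,576 spans
    # 1,048,576 / 256 = 4096 blocks = 4096 * 144 = 589,824 bytes, so expert_id = k starts at
    # byte offset k * 589,824 within the fused expert tensor sliced above.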
    def load_gguf_tensor(self, name: str, device: str = "cpu", target_dtype=None) -> torch.Tensor:
        name = translate_name_to_gguf(name)
        t = self.tensor_info[name]
        if target_dtype is None:
            target_dtype = torch.get_default_dtype()

        shape = t["shape"]
        ggml_type = t["ggml_type"]

        if ggml_type not in GGML_NAMES:
            raise NotImplementedError(f"ggml_type {ggml_type} not implemented")

        ggml_name = GGML_NAMES[ggml_type]

        data = self.get_mmap_tensor(name)

        block_size = GGML_BLOCK_SIZES[ggml_name]
        elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
        num_elements = int(np.prod(shape))
        num_blocks = num_elements // elements_per_block

        blocks_per_iter = 16384
        if num_blocks > blocks_per_iter:
            # Dequantize large tensors chunk by chunk to bound peak memory
            values = torch.empty((num_blocks, elements_per_block), dtype=target_dtype, device=device)
            for i in range((num_blocks + blocks_per_iter - 1) // blocks_per_iter):
                blocks_begin = i * blocks_per_iter
                blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
                if "cuda" in device.lower():
                    cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin * block_size : blocks_end * block_size], device, target_dtype)
                else:
                    cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin * block_size : blocks_end * block_size])
                    cur_values = torch.from_numpy(cur_values.copy())

                cur_values = cur_values.view(-1, elements_per_block)
                if ggml_name == "BF16":
                    cur_values = cur_values.view(torch.bfloat16)
                values[blocks_begin : blocks_end] = cur_values
        else:
            if "cuda" in device.lower():
                values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
            else:
                values = GGML_DEQUANTIZE[ggml_name](data)
                values = torch.from_numpy(values).to(device)

        if ggml_name == "BF16":
            values = values.view(torch.bfloat16)

        values = values.view(shape[::-1])

        if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count']
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
                      .swapaxes(1, 2)
                      .reshape(values.shape))
        elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
            n_head = self.gguf_file_meta['llama.attention.head_count_kv']
            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
                      .swapaxes(1, 2)
                      .reshape(values.shape))

        return values

    def has_tensor(self, name: str):
        name = translate_name_to_gguf(name)
        return name in self.tensor_info

    def get_ggml_type(self, name: str):
        name = translate_name_to_gguf(name)
        if name not in self.tensor_info:
            raise KeyError(f"Key {name} not found in GGUF files")
        return self.tensor_info[name]["ggml_type"]


class ModelLoaderFactory:
    """
    Factory class for creating model loaders.
    Automatically detects the model format based on file extensions in the directory.
    """

    @staticmethod
    def create_loader(path: str):
        """
        Create a model loader for the given path by detecting the model format.

        The function checks for the presence of .safetensors or .gguf files
        in the specified path and creates the appropriate loader.

        Args:
            path: Path to the model directory or file

        Returns:
            An appropriate ModelLoader instance (SafeTensorLoader or GGUFLoader)

        Raises:
            FileNotFoundError: If no supported model files are found in the path
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")

        # Normalize to directory path if a file was provided
        if os.path.isfile(path):
            if path.endswith(".safetensors"):
                return SafeTensorLoader(path)
            elif path.endswith(".gguf"):
                return GGUFLoader(path)
            else:
                folder_path = os.path.dirname(path)
        else:
            folder_path = path

        # Scan the tree so both formats can be detected when they coexist
        has_safetensors = False
        has_gguf = False
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".safetensors"):
                    has_safetensors = True
                elif file.endswith(".gguf"):
                    has_gguf = True
            if has_safetensors and has_gguf:
                break

        # Create the appropriate loader based on detected file types
        # Prioritize SafeTensor over GGUF if both are present
        if has_safetensors:
            try:
                return SafeTensorLoader(folder_path)
            except Exception as e:
                print(f"Failed to create SafeTensorLoader: {e}")
                # Fall through to try GGUF if SafeTensor fails
                if not has_gguf:
                    raise
        if has_gguf:
            try:
                return GGUFLoader(folder_path)
            except Exception as e:
                print(f"Failed to create GGUFLoader: {e}")
                raise

        # No supported model files found
        raise FileNotFoundError(f"No .safetensors or .gguf files found in: {folder_path}")
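
# Usage sketch (illustrative only): the model directory and tensor name below are
# placeholders, not real paths; ModelLoaderFactory picks the loader and the tensor
# is then fetched by name with the loader-specific accessor.
if __name__ == "__main__":
    model_dir = "/path/to/model"  # hypothetical checkpoint directory
    loader = ModelLoaderFactory.create_loader(model_dir)
    name = "model.embed_tokens.weight"  # hypothetical tensor name
    if loader.has_tensor(name):
        if isinstance(loader, GGUFLoader):
            t = loader.load_gguf_tensor(name, device="cpu")
        else:
            t = loader.load_tensor(name, device="cpu")
        print(name, tuple(t.shape), t.dtype)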