diff --git a/common/download.cpp b/common/download.cpp
index 06e91fb3f..7bf2ac588 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -552,6 +552,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
     return best;
 }
 
+static bool gguf_filename_is_model(const std::string & filepath) {
+    if (!string_ends_with(filepath, ".gguf")) {
+        return false;
+    }
+
+    std::string filename = filepath;
+    if (auto pos = filename.rfind('/'); pos != std::string::npos) {
+        filename = filename.substr(pos + 1);
+    }
+
+    return filename.find("mmproj") == std::string::npos &&
+           filename.find("imatrix") == std::string::npos;
+}
+
 static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
                                          const std::string & tag) {
     std::vector<std::string> tags;
@@ -565,8 +579,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
     for (const auto & t : tags) {
         std::regex pattern(t + "[.-]", std::regex::icase);
         for (const auto & f : files) {
-            if (string_ends_with(f.path, ".gguf") &&
-                f.path.find("mmproj") == std::string::npos &&
+            if (gguf_filename_is_model(f.path) &&
                 std::regex_search(f.path, pattern)) {
                 return f;
             }
@@ -574,8 +587,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
     }
 
     for (const auto & f : files) {
-        if (string_ends_with(f.path, ".gguf") &&
-            f.path.find("mmproj") == std::string::npos) {
+        if (gguf_filename_is_model(f.path)) {
             return f;
         }
     }
diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp
index 6645aceb2..80dcab017 100644
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@@ -38,6 +38,7 @@ static fs::path get_cache_directory() {
         const char * var;
         fs::path     path;
     } entries[] = {
+        {"LLAMA_CACHE",           fs::path()},
         {"HF_HUB_CACHE",          fs::path()},
         {"HUGGINGFACE_HUB_CACHE", fs::path()},
         {"HF_HOME",               fs::path("hub")},
@@ -325,9 +326,15 @@ hf_files get_repo_files(const std::string & repo_id,
             if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
                 file.oid = item["lfs"]["oid"].get<std::string>();
             }
+            if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
+                file.size = item["lfs"]["size"].get<size_t>();
+            }
         } else if (item.contains("oid") && item["oid"].is_string()) {
             file.oid = item["oid"].get<std::string>();
         }
+        if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
+            file.size = item["size"].get<size_t>();
+        }
 
         if (!file.oid.empty() && !is_valid_oid(file.oid)) {
             LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
@@ -487,6 +494,34 @@ std::string finalize_file(const hf_file & file) {
 
 // delete everything after this line, one day
 
+// copied from download.cpp without the tag part
+struct gguf_split_info {
+    std::string prefix; // tag included
+    int index;
+    int count;
+};
+
+static gguf_split_info get_gguf_split_info(const std::string & path) {
+    static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
+    std::smatch m;
+
+    std::string prefix = path;
+    if (!string_remove_suffix(prefix, ".gguf")) {
+        return {};
+    }
+
+    int index = 1;
+    int count = 1;
+
+    if (std::regex_match(prefix, m, re_split)) {
+        index  = std::stoi(m[2].str());
+        count  = std::stoi(m[3].str());
+        prefix = m[1].str();
+    }
+
+    return {std::move(prefix), index, count};
+}
+
 static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
     static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
     std::smatch match;
@@ -504,25 +539,30 @@ static std::string make_old_cache_filename(const std::string & owner,
     return result;
 }
 
-static void migrate_single_file(const fs::path    & old_cache,
-                                const std::string & owner,
-
const std::string & repo, - const nl::json & node, - const hf_files & files) { +struct migrate_file { + std::string path; + std::string sha256; + size_t size; + fs::path old_path; + fs::path etag_path; + const hf_file * file; +}; - if (!node.contains("rfilename") || - !node.contains("lfs") || - !node["lfs"].contains("sha256")) { - return; - } +using migrate_files = std::vector; - std::string path = node["rfilename"]; - std::string sha256 = node["lfs"]["sha256"]; +static bool collect_file(const fs::path & old_cache, + const std::string & owner, + const std::string & repo, + const std::string & path, + const std::string & sha256, + const hf_files & files, + migrate_files & to_migrate) { + + const hf_file * file = nullptr; - const hf_file * file_info = nullptr; for (const auto & f : files) { if (f.path == path) { - file_info = &f; + file = &f; break; } } @@ -532,41 +572,105 @@ static void migrate_single_file(const fs::path & old_cache, fs::path etag_path = old_path.string() + ".etag"; if (!fs::exists(old_path)) { - if (fs::exists(etag_path)) { - LOG_WRN("%s: %s is orphan, deleting...\n", __func__, etag_path.string().c_str()); - fs::remove(etag_path); + if (file && fs::exists(file->final_path)) { + return true; } - return; + LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str()); + return false; } - if (!file_info) { - LOG_WRN("%s: %s not found in current repo, ignoring...\n", __func__, old_filename.c_str()); - return; - } else if (!sha256.empty() && !file_info->oid.empty() && sha256 != file_info->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch), ignoring...\n", __func__, old_filename.c_str()); - return; + if (!file) { + LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str()); + return false; } + if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) { + LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str()); + return false; + } + + if (file->size > 0) { + size_t size = fs::file_size(old_path); + if (size != file->size) { + LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size); + return false; + } + } + + to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file}); + return true; +} + +static bool collect_files(const fs::path & old_cache, + const std::string & owner, + const std::string & repo, + const nl::json & node, + const hf_files & files, + migrate_files & to_migrate) { + + if (!node.contains("rfilename") || + !node.contains("lfs") || + !node["lfs"].contains("sha256")) { + return true; + } + + std::string path = node["rfilename"]; + std::string sha256 = node["lfs"]["sha256"]; + + auto split = get_gguf_split_info(path); + + if (split.count <= 1) { + return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate); + } + + std::vector> splits; + + for (const auto & f : files) { + auto split_f = get_gguf_split_info(f.path); + if (split_f.count == split.count && split_f.prefix == split.prefix) { + // sadly the manifest only provides the sha256 of the first file (index == 1) + // the rest will be verified using the size... + std::string f_sha256 = (split_f.index == 1) ? 
sha256 : ""; + splits.emplace_back(f.path, f_sha256); + } + } + + if ((int)splits.size() != split.count) { + LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size()); + return false; + } + + for (const auto & [f_path, f_sha256] : splits) { + if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) { + return false; + } + } + + return true; +} + +static bool migrate_file(const migrate_file & file) { std::error_code ec; - fs::path new_path(file_info->local_path); + fs::path new_path(file.file->local_path); fs::create_directories(new_path.parent_path(), ec); if (!fs::exists(new_path, ec)) { - fs::rename(old_path, new_path, ec); + fs::rename(file.old_path, new_path, ec); if (ec) { - fs::copy_file(old_path, new_path, ec); + fs::copy_file(file.old_path, new_path, ec); if (ec) { - LOG_WRN("%s: failed to move/copy %s: %s\n", __func__, old_path.string().c_str(), ec.message().c_str()); - return; + LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str()); + return false; } } - fs::remove(old_path, ec); + fs::remove(file.old_path, ec); } - fs::remove(etag_path, ec); + fs::remove(file.etag_path, ec); - std::string filename = finalize_file(*file_info); - LOG_INF("%s: migrated %s -> %s\n", __func__, old_filename.c_str(), filename.c_str()); + std::string filename = finalize_file(*file.file); + LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str()); + return true; } void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { @@ -614,19 +718,43 @@ void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { continue; } + migrate_files to_migrate; + bool ok = true; + try { std::ifstream manifest(entry.path()); auto json = nl::json::parse(manifest); - for (const char * key : {"ggufFile", "mmprojFile"}) { if (json.contains(key)) { - migrate_single_file(old_cache, owner, repo, json[key], files); + if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) { + ok = false; + break; + } } } } catch (const std::exception & e) { LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what()); continue; } + + if (!ok) { + LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__); + continue; + } + + for (const auto & file : to_migrate) { + if (!migrate_file(file)) { + ok = false; + break; + } + } + + if (!ok) { + LOG_WRN("%s: migration failed: could not migrate all files\n", __func__); + continue; + } + + LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str()); fs::remove(entry.path()); } } diff --git a/common/hf-cache.h b/common/hf-cache.h index ee2e98494..9e46f9774 100644 --- a/common/hf-cache.h +++ b/common/hf-cache.h @@ -14,6 +14,7 @@ struct hf_file { std::string final_path; std::string oid; std::string repo_id; + size_t size = 0; // only for the migration }; using hf_files = std::vector; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f377738f8..82d1004c6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -486,7 +486,7 @@ class ModelBase: elif quant_method == "modelopt": # Mixed-precision ModelOpt models: NVFP4 tensors are handled by # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and - # are dequantized here. input_scale tensors are unused. + # are dequantized here. k/v scale tensors are unused. 
for name in self.model_tensors.keys(): if name.endswith(".weight_scale"): weight_name = name.removesuffix("_scale") @@ -494,7 +494,7 @@ class ModelBase: s = self.model_tensors[name] self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None) tensors_to_remove.append(name) - if name.endswith((".input_scale", ".k_scale", ".v_scale")): + if name.endswith((".k_scale", ".v_scale")): tensors_to_remove.append(name) elif quant_method is not None: raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") @@ -542,7 +542,6 @@ class ModelBase: raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) # Handle gate/up expert tensor fusion if enabled @@ -607,7 +606,12 @@ class ModelBase: def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool: return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6 - def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor): + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + if "language_model." in name: + name = name.replace("language_model.", "") + + new_name = self.map_tensor_name(name) + raw, shape = self._nvfp4_pack(weight, scale) logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4") self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4) @@ -619,10 +623,18 @@ class ModelBase: logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])") self.gguf_writer.add_tensor(scale_name, scale2_f32) + # Emit per-tensor input_scale as a separate F32 tensor when non-trivial + if not self._nvfp4_scale2_is_trivial(input_scale): + input_scale_f32 = input_scale.float().numpy().flatten() + input_scale_name = new_name.replace(".weight", ".input_scale") + logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])") + self.gguf_writer.add_tensor(input_scale_name, input_scale_f32) + def _generate_nvfp4_tensors(self): # Per-layer expert merging to avoid holding all experts in memory expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {} expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} + expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} expert_shapes: dict[tuple[int, str], list[int]] = {} n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 consumed: list[str] = [] @@ -632,6 +644,7 @@ class ModelBase: continue scale_name = name.replace(".weight", ".weight_scale") scale2_name = name.replace(".weight", ".weight_scale_2") + input_scale_name = name.replace(".weight", ".input_scale") if scale_name not in self.model_tensors: continue # Force eager materialization of lazy tensors @@ -643,11 +656,14 @@ class ModelBase: continue scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))()) + input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))()) # Mark tensors for removal from model_tensors (already written to gguf) consumed.extend([name, scale_name]) if scale2_name in self.model_tensors: consumed.append(scale2_name) + if input_scale_name in self.model_tensors: + consumed.append(input_scale_name) # Check if this is a per-expert tensor m = 
re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name) @@ -663,34 +679,37 @@ class ModelBase: if key not in expert_blocks: expert_blocks[key] = [] expert_scales[key] = [] + expert_input_scales[key] = [] expert_shapes[key] = shape expert_blocks[key].append((expert_id, raw.copy())) # Collect per-expert scale2 (scalar per expert) expert_scales[key].append((expert_id, float(scale2.float().sum()))) + # Collect per-expert input_scale (scalar per expert) + expert_input_scales[key].append((expert_id, float(input_scale.float().sum()))) # Flush when all experts for this (layer, proj) are collected if n_experts > 0 and len(expert_blocks[key]) >= n_experts: - self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type) + self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) else: - new_name = self.map_tensor_name(name) - self._repack_nvfp4(new_name, weight, scale, scale2) + self._repack_nvfp4(name, weight, scale, scale2, input_scale) # Flush any remaining experts (fallback if n_experts was unknown) for (bid, proj_type) in list(expert_blocks.keys()): - self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type) + self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) # Remove consumed tensors so get_tensors/modify_tensors won't see them for name in consumed: self.model_tensors.pop(name, None) - # Remove unused auxiliary tensors (input_scale, k_scale, v_scale) + # Remove any remaining unused auxiliary tensors for name in list(self.model_tensors.keys()): - if name.endswith((".input_scale", ".k_scale", ".v_scale")): + if name.endswith((".k_scale", ".v_scale")): del self.model_tensors[name] - def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type): + def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type): experts = expert_blocks.pop(key) scales = expert_scales.pop(key) + input_scales = expert_input_scales.pop(key) shape = expert_shapes.pop(key) experts.sort(key=lambda x: x[0]) @@ -708,6 +727,14 @@ class ModelBase: logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])") self.gguf_writer.add_tensor(scale_name, scale_vals) + # Emit per-expert input_scale tensor if any expert has non-trivial input_scale + input_scales.sort(key=lambda x: x[0]) + input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32) + if not np.allclose(input_scale_vals, 1.0, atol=1e-6): + input_scale_name = new_name.replace(".weight", ".input_scale") + logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])") + self.gguf_writer.add_tensor(input_scale_name, input_scale_vals) + del experts, merged def prepare_tensors(self): @@ -1311,6 +1338,9 @@ class TextModel(ModelBase): if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": # ref: https://huggingface.co/aari1995/German_Semantic_V3 res = "jina-v2-de" + if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4": + # ref: https://huggingface.co/evilfreelancer/ruGPT3XL + res = "gpt-2" if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" @@ -5011,6 +5041,97 @@ class _LinearAttentionVReorderBase(Qwen3NextModel): perm[dim], perm[dim + 1] = 
perm[dim + 1], perm[dim] return tensor.permute(*perm).contiguous().reshape(*shape) + def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]: + if not name.endswith(( + ".linear_attn.in_proj_qkv.weight", + ".linear_attn.in_proj_z.weight", + ".linear_attn.in_proj_a.weight", + ".linear_attn.in_proj_b.weight", + ".linear_attn.out_proj.weight", + )): + return weight, scale + + num_k_heads = self.hparams["linear_num_key_heads"] + num_v_heads = self.hparams["linear_num_value_heads"] + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_per_k = num_v_heads // num_k_heads + + def unpack_nibbles(qs: Tensor) -> Tensor: + lo = torch.bitwise_and(qs, 0x0F) + hi = torch.bitwise_right_shift(qs, 4) + return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2) + + def pack_nibbles(codes: Tensor) -> Tensor: + codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2) + lo = torch.bitwise_and(codes[..., 0], 0x0F) + hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4) + return torch.bitwise_or(lo, hi).contiguous() + + def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]: + assert qs.ndim >= 2 + assert scales.ndim >= 2 + + k = qs.shape[-1] * 2 + assert col_perm.numel() == k + assert k % 16 == 0 + + group_cols = col_perm.reshape(-1, 16) + group_starts = group_cols[:, 0] + expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype) + assert torch.equal(group_cols, expected) + assert torch.all(group_starts % 16 == 0) + + group_perm = (group_starts // 16).to(dtype=torch.long) + expected_groups = torch.arange(scales.shape[-1], dtype=torch.long) + assert group_perm.numel() == scales.shape[-1] + assert torch.equal(torch.sort(group_perm).values, expected_groups) + + codes = unpack_nibbles(qs) + codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long)) + qs = pack_nibbles(codes) + scales = scales.index_select(-1, group_perm.to(device=scales.device)) + return qs, scales + + def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]: + row_perm = self._reorder_v_heads( + torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1), + 0, num_k_heads, num_v_per_k, head_dim, + ).squeeze(-1) + return ( + qs.index_select(0, row_perm.to(device=qs.device)), + scales.index_select(0, row_perm.to(device=scales.device)), + ) + + if name.endswith(".linear_attn.in_proj_qkv.weight"): + q_dim = head_k_dim * num_k_heads + k_dim = head_k_dim * num_k_heads + q = weight[:q_dim] + k = weight[q_dim:q_dim + k_dim] + v = weight[q_dim + k_dim:] + q_scale = scale[:q_dim] + k_scale = scale[q_dim:q_dim + k_dim] + v_scale = scale[q_dim + k_dim:] + v, v_scale = reorder_rows(v, v_scale, head_v_dim) + return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0) + + if name.endswith(".linear_attn.in_proj_z.weight"): + weight, scale = reorder_rows(weight, scale, head_v_dim) + elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")): + weight, scale = reorder_rows(weight, scale, 1) + elif name.endswith(".linear_attn.out_proj.weight"): + col_perm = self._reorder_v_heads( + torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0), + 1, num_k_heads, num_v_per_k, head_v_dim, + ).squeeze(0) + weight, scale = apply_col_perm(weight, scale, col_perm) + + return weight, scale + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, 
scale2: Tensor, input_scale: Tensor): + weight, scale = self._transform_nvfp4_weight(name, weight, scale) + super()._repack_nvfp4(name, weight, scale, scale2, input_scale) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_k_heads = self.hparams.get("linear_num_key_heads", 0) num_v_heads = self.hparams.get("linear_num_value_heads", 0) @@ -5100,6 +5221,47 @@ class GPT2Model(TextModel): yield from super().modify_tensors(data_torch, new_name, bid) +@ModelBase.register("RuGPT3XLForCausalLM") +class RuGPT3XLModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPT2 + + _qkv_parts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Fuse separate Q, K, V projections into a single QKV tensor + if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name: + suffix = "weight" if name.endswith(".weight") else "bias" + part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v") + key = f"{part}.{suffix}" + + assert bid is not None + if self._qkv_parts is None: + self._qkv_parts = [{} for _ in range(self.block_count)] + self._qkv_parts[bid][key] = data_torch + + q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}" + if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]): + q = self._qkv_parts[bid].pop(q_key) + k = self._qkv_parts[bid].pop(k_key) + v = self._qkv_parts[bid].pop(v_key) + data_torch = torch.cat([q, k, v], dim=0) + name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}") + logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}") + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._qkv_parts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()] + if len(parts) > 0: + raise ValueError(f"Unprocessed Q/K/V parts: {parts}") + + @ModelBase.register("PhiForCausalLM") class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 1e8b29fb2..086f1c228 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -178,6 +178,7 @@ pre_computed_hashes = [ {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"}, # jina-v2-de variants {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"}, + {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"}, ] diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 86e91b84d..abb5957e9 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -806,6 +806,16 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { #endif // CUDART_VERSION >= 12050 } +static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) { +#ifdef FP8_AVAILABLE + const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation. 
+    const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
+    return static_cast<float>(xf) / 2;
+#else
+    NO_DEVICE_CODE;
+#endif // FP8_AVAILABLE
+}
+
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
     const uint8_t sign_bit = (x < 0.0f) << 3;
     float ax = fabsf(x) * e;
@@ -938,6 +948,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
     static constexpr int qi = QI_MXFP4;
 };
 
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_NVFP4> {
+    static constexpr int qk = QK_NVFP4;
+    static constexpr int qr = QR_NVFP4;
+    static constexpr int qi = QI_NVFP4;
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
     static constexpr int qk = QK_K;
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index b70492c7d..79ccfe568 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -617,6 +617,45 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_mxfp4<<>>(vx, y);
 }
 
+template <typename dst_t>
+static __global__ void dequantize_block_nvfp4(
+        const void * __restrict__ vx,
+        dst_t * __restrict__ yy,
+        const int64_t ne) {
+    const int64_t i   = blockIdx.x;
+    const int     tid = threadIdx.x;
+
+    const int64_t base = i * QK_NVFP4;
+    if (base >= ne) {
+        return;
+    }
+
+    const block_nvfp4 * x  = (const block_nvfp4 *) vx;
+    const block_nvfp4 & xb = x[i];
+
+    const int sub = tid / (QK_NVFP4_SUB / 2);
+    const int j   = tid % (QK_NVFP4_SUB / 2);
+
+    const float   d = ggml_cuda_ue4m3_to_fp32(xb.d[sub]);
+    const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j];
+
+    const int64_t y0 = base + sub * QK_NVFP4_SUB + j;
+    const int64_t y1 = y0 + QK_NVFP4_SUB / 2;
+
+    yy[y0] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q & 0x0F]);
+    yy[y1] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q >> 4]);
+}
+
+template <typename dst_t>
+static void dequantize_row_nvfp4_cuda(
+        const void * vx,
+        dst_t * y,
+        const int64_t k,
+        cudaStream_t stream) {
+    GGML_ASSERT(k % QK_NVFP4 == 0);
+    const int nb = k / QK_NVFP4;
+    dequantize_block_nvfp4<<<nb, QK_NVFP4/2, 0, stream>>>(vx, y, k);
+}
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
     const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
@@ -715,6 +754,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_MXFP4:
             return dequantize_row_mxfp4_cuda;
+        case GGML_TYPE_NVFP4:
+            return dequantize_row_nvfp4_cuda;
         case GGML_TYPE_F32:
             return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
@@ -766,6 +807,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_MXFP4:
             return dequantize_row_mxfp4_cuda;
+        case GGML_TYPE_NVFP4:
+            return dequantize_row_nvfp4_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cont_cuda<half>;
         case GGML_TYPE_BF16:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 68470bfcb..a389767fb 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1297,7 +1297,12 @@ static void ggml_cuda_op_mul_mat_cublas(
     const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
                                (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
-    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
+    const bool use_fp16 =
+        src0->type != GGML_TYPE_NVFP4 &&
+        (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        ggml_is_contiguous(src0) &&
+        row_diff == src0->ne[1] &&
+        dst->op_params[0] == GGML_PREC_DEFAULT;
 
     if (supports_bf16 && src0->type == GGML_TYPE_BF16 &&
         ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
@@ -4802,6 +4807,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_MXFP4:
+#ifdef FP8_AVAILABLE
+                case GGML_TYPE_NVFP4:
+#endif // FP8_AVAILABLE
                 case GGML_TYPE_Q2_K:
                 case GGML_TYPE_Q3_K:
                 case GGML_TYPE_Q4_K:
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 024b3d8cf..66bd8beea 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -15,6 +15,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
         case GGML_TYPE_Q5_1:  return vec_dot_q5_1_q8_1;
         case GGML_TYPE_Q8_0:  return vec_dot_q8_0_q8_1;
         case GGML_TYPE_MXFP4: return vec_dot_mxfp4_q8_1;
+        case GGML_TYPE_NVFP4: return vec_dot_nvfp4_q8_1;
         case GGML_TYPE_Q2_K:  return vec_dot_q2_K_q8_1;
         case GGML_TYPE_Q3_K:  return vec_dot_q3_K_q8_1;
         case GGML_TYPE_Q4_K:  return vec_dot_q4_K_q8_1;
@@ -41,6 +42,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
         case GGML_TYPE_Q5_1:  return VDR_Q5_1_Q8_1_MMVQ;
         case GGML_TYPE_Q8_0:  return VDR_Q8_0_Q8_1_MMVQ;
         case GGML_TYPE_MXFP4: return VDR_MXFP4_Q8_1_MMVQ;
+        case GGML_TYPE_NVFP4: return VDR_NVFP4_Q8_1_MMVQ;
         case GGML_TYPE_Q2_K:  return VDR_Q2_K_Q8_1_MMVQ;
         case GGML_TYPE_Q3_K:  return VDR_Q3_K_Q8_1_MMVQ;
         case GGML_TYPE_Q4_K:  return VDR_Q4_K_Q8_1_MMVQ;
@@ -626,6 +628,12 @@ static void mul_mat_vec_q_switch_type(
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
             break;
+        case GGML_TYPE_NVFP4:
+            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_NVFP4>
+                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
+                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
+            break;
         case GGML_TYPE_Q2_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
                 (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
index ab803aca2..40b2b41e7 100644
--- a/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -322,6 +322,38 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
     return d * sumi;
 }
 
+#define VDR_NVFP4_Q8_1_MMVQ 4
+#define VDR_NVFP4_Q8_1_MMQ  8
+
+static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
+    const void * __restrict__ vbq,
+    const block_q8_1 * __restrict__ bq8_1,
+    const int32_t & kbx,
+    const int32_t & iqs) {
+
+    const block_nvfp4 * bq4 = (const block_nvfp4 *) vbq + kbx;
+    float sum = 0.0f;
+#pragma unroll
+    for (int i = 0; i < VDR_NVFP4_Q8_1_MMVQ/2; i++) {
+        const int32_t iqs0 = iqs + 2*i;
+        const int32_t iqs1 = iqs0 + 1;
+        const int32_t is   = iqs0 >> 1;
+        const int2 v0 = get_int_from_table_16(get_int_b4(bq4->qs, iqs0), kvalues_mxfp4);
+        const int2 v1 = get_int_from_table_16(get_int_b4(bq4->qs, iqs1), kvalues_mxfp4);
+        const block_q8_1 * bq8 = bq8_1 + (is >> 1);
+        const int32_t i8 = ((is & 1) << 2);
+
+        int sumi = ggml_cuda_dp4a(v0.x, get_int_b4(bq8->qs, i8 + 0), 0);
+        sumi     = ggml_cuda_dp4a(v0.y, get_int_b4(bq8->qs, i8 + 2), sumi);
+        sumi     = ggml_cuda_dp4a(v1.x, get_int_b4(bq8->qs, i8 + 1), sumi);
+        sumi     = ggml_cuda_dp4a(v1.y, get_int_b4(bq8->qs, i8 + 3), sumi);
+
+        const float d =
ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * __low2float(bq8->ds); + sum += d * float(sumi); + } + + return sum; +} #define VDR_Q2_K_Q8_1_MMVQ 1 #define VDR_Q2_K_Q8_1_MMQ 4 diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h index ba032cfab..07bc47df3 100644 --- a/ggml/src/ggml-cuda/vendors/cuda.h +++ b/ggml/src/ggml-cuda/vendors/cuda.h @@ -6,9 +6,10 @@ #include #include -#if CUDART_VERSION >= 12050 +#if CUDART_VERSION >= 11080 #include -#endif // CUDART_VERSION >= 12050 +#define FP8_AVAILABLE +#endif // CUDART_VERSION >= 11080 #if CUDART_VERSION >= 12080 #include diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 35d1e1a06..9d9ba1ee2 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -235,6 +235,12 @@ typedef __hip_bfloat16 nv_bfloat16; typedef __hip_bfloat162 nv_bfloat162; +#if HIP_VERSION >= 60200000 +#include +typedef __hip_fp8_e4m3 __nv_fp8_e4m3; +#define FP8_AVAILABLE +#endif // HIP_VERSION >= 60200000 + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 281c1a830..df70577db 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -63,6 +63,7 @@ class TensorNameMap: "transformer.wpe", # gpt2 "embeddings.position_embeddings", # bert "wpe", # gpt2 + "model.embed_positions", # rugpt3xl ), # Output diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ecac2811f..ceae8d408 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7736,6 +7736,65 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_s && layer.ssm_beta) { layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + + // input scales + if (!layer.wq_in_s && layer.wq) { + layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.wk_in_s && layer.wk) { + layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.wv_in_s && layer.wv) { + layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.wo_in_s && layer.wo) { + layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.wqkv_in_s && layer.wqkv) { + layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.wqkv_gate_in_s && layer.wqkv_gate) { + layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_gate_in_s && layer.ffn_gate) { + layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_down_in_s && layer.ffn_down) { + layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_up_in_s && layer.ffn_up) { + layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) { + layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) { + 
layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) { + layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) { + layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) { + layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) { + layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_in_in_s && layer.ssm_in) { + layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_out_in_s && layer.ssm_out) { + layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_alpha_in_s && layer.ssm_alpha) { + layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.ssm_beta_in_s && layer.ssm_beta) { + layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } } } diff --git a/src/llama-model.h b/src/llama-model.h index aefcfe700..96ab31cbb 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -414,6 +414,27 @@ struct llama_layer { struct ggml_tensor * ssm_alpha_s = nullptr; struct ggml_tensor * ssm_beta_s = nullptr; + // input scales + struct ggml_tensor * wq_in_s = nullptr; + struct ggml_tensor * wk_in_s = nullptr; + struct ggml_tensor * wv_in_s = nullptr; + struct ggml_tensor * wo_in_s = nullptr; + struct ggml_tensor * wqkv_in_s = nullptr; + struct ggml_tensor * wqkv_gate_in_s = nullptr; + struct ggml_tensor * ffn_gate_in_s = nullptr; + struct ggml_tensor * ffn_up_in_s = nullptr; + struct ggml_tensor * ffn_down_in_s = nullptr; + struct ggml_tensor * ffn_gate_exps_in_s = nullptr; + struct ggml_tensor * ffn_down_exps_in_s = nullptr; + struct ggml_tensor * ffn_up_exps_in_s = nullptr; + struct ggml_tensor * ffn_gate_shexp_in_s= nullptr; + struct ggml_tensor * ffn_up_shexp_in_s = nullptr; + struct ggml_tensor * ffn_down_shexp_in_s= nullptr; + struct ggml_tensor * ssm_in_in_s = nullptr; + struct ggml_tensor * ssm_out_in_s = nullptr; + struct ggml_tensor * ssm_alpha_in_s = nullptr; + struct ggml_tensor * ssm_beta_in_s = nullptr; + // altup & laurel struct ggml_tensor * per_layer_inp_gate = nullptr; struct ggml_tensor * per_layer_proj = nullptr;
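A note on the `LLAMA_CACHE` entry added to `get_cache_directory()` in common/hf-cache.cpp: the `entries[]` table is scanned in order, so `LLAMA_CACHE` now takes priority over `HF_HUB_CACHE`, `HUGGINGFACE_HUB_CACHE` and `HF_HOME`. A minimal sketch of that first-set-variable-wins lookup is below; it is illustrative only (the real function also falls back to a platform default and appends the sub-path given in the table's second column, e.g. `hub` for `HF_HOME`).

```cpp
// Sketch of the env-var precedence introduced above; not the llama.cpp code itself.
#include <cstdlib>
#include <filesystem>

namespace fs = std::filesystem;

static fs::path resolve_cache_dir() {
    static const struct { const char * var; fs::path sub; } entries[] = {
        {"LLAMA_CACHE",           fs::path()},      // checked first after this patch
        {"HF_HUB_CACHE",          fs::path()},
        {"HUGGINGFACE_HUB_CACHE", fs::path()},
        {"HF_HOME",               fs::path("hub")}, // HF_HOME points at the parent of "hub"
    };
    for (const auto & e : entries) {
        if (const char * v = std::getenv(e.var); v && *v) {
            return fs::path(v) / e.sub;
        }
    }
    return {}; // caller falls back to the default cache location
}
```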
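On `get_gguf_split_info()`: split GGUF shards follow the `-NNNNN-of-NNNNN.gguf` naming scheme with five-digit, one-based indices, and the helper returns `index = count = 1` for non-split files. The standalone sketch below mirrors that parsing so the convention is easy to try in isolation; `split_info` and `parse_split` are illustrative names, not identifiers from the patch.

```cpp
// Illustrative re-implementation of the "-%05d-of-%05d" suffix parsing used above.
#include <cstdio>
#include <regex>
#include <string>

struct split_info { std::string prefix; int index = 1; int count = 1; };

static split_info parse_split(std::string path) {
    static const std::regex re("^(.+)-([0-9]{5})-of-([0-9]{5})\\.gguf$", std::regex::icase);
    std::smatch m;
    if (std::regex_match(path, m, re)) {
        return { m[1].str(), std::stoi(m[2].str()), std::stoi(m[3].str()) };
    }
    if (path.size() > 5 && path.compare(path.size() - 5, 5, ".gguf") == 0) {
        path.resize(path.size() - 5); // plain single-file model: strip the extension
    }
    return { path, 1, 1 };
}

int main() {
    const split_info s = parse_split("Model-Q4_K_M-00002-of-00005.gguf");
    std::printf("%s %d/%d\n", s.prefix.c_str(), s.index, s.count); // Model-Q4_K_M 2/5
}
```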
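On `ggml_cuda_ue4m3_to_fp32()` in ggml-cuda/common.cuh: the NVFP4 sub-block scale byte is reinterpreted as FP8 E4M3, the two NaN encodings (0x7F, 0xFF) are mapped to 0.0f, and the result is divided by 2, which matches the convention of the shared `kvalues_mxfp4` table (assumed here to store doubled E2M1 magnitudes). Below is a host-side reference decode, assuming the standard OCP E4M3 layout (1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits, no infinities); it is a checking aid, not the CPU implementation the comment refers to.

```cpp
// Reference E4M3 decode for checking scale values; assumes OCP FP8 E4M3 semantics.
#include <cmath>
#include <cstdint>
#include <cstdio>

static float e4m3_to_float(uint8_t v) {
    if (v == 0x7F || v == 0xFF) {
        return 0.0f;                               // NaN encodings -> 0.0f, as in the CUDA helper
    }
    const int   s = v >> 7;
    const int   e = (v >> 3) & 0xF;
    const int   m = v & 0x7;
    const float mag = (e == 0) ? std::ldexp(m / 8.0f, -6)           // subnormal
                               : std::ldexp(1.0f + m / 8.0f, e - 7);
    return s ? -mag : mag;
}

// "UE4M3" as used by the kernel: the stored byte decodes to twice the intended
// scale, so the extra factor of 2 is cancelled here.
static float ue4m3_to_float(uint8_t v) { return e4m3_to_float(v) / 2.0f; }

int main() {
    std::printf("%g %g %g\n", ue4m3_to_float(0x38), ue4m3_to_float(0x00), ue4m3_to_float(0x7F)); // 0.5 0 0
}
```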
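On `dequantize_block_nvfp4` in ggml-cuda/convert.cu: a block holds `QK_NVFP4` values split into sub-blocks of `QK_NVFP4_SUB`, one UE4M3 scale per sub-block and two E2M1 codes per `qs` byte, where the low nibble lands at element `j` of the sub-block and the high nibble at `j + QK_NVFP4_SUB/2`. The host-side sketch below mirrors that indexing; the concrete constant values, the `block_nvfp4` field layout and the codebook contents are assumptions inferred from the kernel, not copied from ggml-common.h.

```cpp
// Host-side mirror of the kernel's indexing. Constants, struct layout and the
// codebook are assumptions; ue4m3_to_float() is the reference decode sketched above.
#include <cstdint>

constexpr int QK_NVFP4     = 32;   // assumed: values per block
constexpr int QK_NVFP4_SUB = 16;   // assumed: values covered by one scale byte

struct block_nvfp4 {
    uint8_t d[QK_NVFP4 / QK_NVFP4_SUB];   // one UE4M3 scale per sub-block
    uint8_t qs[QK_NVFP4 / 2];             // two E2M1 codes per byte
};

// assumed to match kvalues_mxfp4: doubled E2M1 magnitudes, negatives in the high half
static const int8_t kvalues[16] = { 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12 };

float ue4m3_to_float(uint8_t v);

void dequantize_block_nvfp4_ref(const block_nvfp4 & xb, float * y) {
    for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
        const float d = ue4m3_to_float(xb.d[sub]);  // its 1/2 factor cancels the doubled codebook
        for (int j = 0; j < QK_NVFP4_SUB / 2; ++j) {
            const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j];
            y[sub * QK_NVFP4_SUB + j]                    = d * kvalues[q & 0x0F]; // low nibble
            y[sub * QK_NVFP4_SUB + j + QK_NVFP4_SUB / 2] = d * kvalues[q >> 4];   // high nibble
        }
    }
}
```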
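On `vec_dot_nvfp4_q8_1` in vecdotq.cuh: the MMVQ path forms integer dot products of the 4-bit codes (looked up via the codebook) against the int8 activations with `dp4a`, then scales each partial sum by the sub-block's UE4M3 weight scale times the activation block's q8_1 scale; only the low half of `ds` is used because NVFP4 has no per-block offset. A scalar reference of that arithmetic over one 32-value block, under the same layout assumptions as the sketch above (the real kernel's iteration order and packing differ):

```cpp
// Scalar reference only; the CUDA kernel packs 4 products per dp4a call.
#include <cstdint>

float ue4m3_to_float(uint8_t v); // reference decode from the E4M3 sketch above

float vec_dot_nvfp4_q8_1_ref(const uint8_t qs[16],  // 32 packed E2M1 codes (assumed block size)
                             const uint8_t d[2],    // one UE4M3 scale per 16 values
                             const int8_t  q8[32],  // int8 activations of the matching q8_1 block
                             float         d8) {    // q8_1 scale, i.e. __low2float(ds)
    static const int8_t kvalues[16] = { 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12 };
    float sum = 0.0f;
    for (int sub = 0; sub < 2; ++sub) {
        int sumi = 0;
        for (int j = 0; j < 8; ++j) {
            const uint8_t q = qs[sub*8 + j];
            sumi += kvalues[q & 0x0F] * q8[sub*16 + j];     // low nibble -> element j
            sumi += kvalues[q >> 4]   * q8[sub*16 + j + 8]; // high nibble -> element j + 8
        }
        sum += ue4m3_to_float(d[sub]) * d8 * sumi;          // weight scale x activation scale
    }
    return sum;
}
```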