Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakePresets.json
#	ggml/src/ggml-cuda/CMakeLists.txt
#	tests/test-sampling.cpp
#	tools/mtmd/clip.cpp
commit 38b3bffcef
Author: Concedo
Date:   2025-05-07 19:47:44 +08:00

3 changed files with 1369 additions and 1316 deletions

convert_hf_to_gguf.py

@@ -3915,6 +3915,16 @@ class Gemma3VisionModel(VisionModel):
         # default values below are taken from HF transformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (used by tinygemma3 test model)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims  # unused
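As a sanity check on the arithmetic above: with the stock Gemma 3 vision config (image_size 896, patch_size 14, image_seq_length 256 — values taken from the HF config, not from this diff), the calculation lands exactly on the default scale factor of 4, which is the case the new code deliberately skips writing:

image_seq_length = 256                       # pooled tokens per image
n_per_side = int(image_seq_length ** 0.5)    # 16 pooled tokens per side
image_size = 896                             # input resolution
patch_size = 14                              # ViT patch edge
patches_per_side = image_size // patch_size  # 64 raw patches per side
proj_scale_factor = patches_per_side // n_per_side
print(proj_scale_factor)                     # 4 -> the default, so nothing is written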
@@ -3928,6 +3938,9 @@ class Gemma3VisionModel(VisionModel):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        if "vision_model.head." in name:
+            return []  # skip redundant tensors for tinygemma3
+
         if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
             or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors

src/llama-sampling.cpp

@@ -1750,27 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
-    if (ctx->n < 0.0f) {
+    if (ctx->n <= 0.0f || cur_p->size <= 1) {
         return;
     }
 
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
+    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].logit > max) {
-            max = cur_p->data[i].logit;
+        // Only count non-negative infinity values
+        if (cur_p->data[i].logit != -INFINITY) {
+            if (cur_p->data[i].logit > max) {
+                max = cur_p->data[i].logit;
+            }
+            logits_sum += cur_p->data[i].logit;
+            valid_count++;
         }
-        logits_sum += cur_p->data[i].logit;
     }
-    float mean = logits_sum/cur_p->size;
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
 
     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        acc += pow(cur_p->data[i].logit - mean, 2);
+        // Skip -infinity in std calculation
+        if (cur_p->data[i].logit != -INFINITY) {
+            acc += pow(cur_p->data[i].logit - mean, 2);
+        }
     }
-    float std = sqrt(acc/cur_p->size);
+    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
 
     //apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
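The -INFINITY handling is the substance of this change: once any token has already been masked to -inf (for example by an earlier sampler in the chain), the old code's running sum became -inf and the variance became NaN, so the comparison against max - n*std never fired and top-n-sigma silently turned into a no-op. A minimal NumPy sketch of the equivalent logic (an illustration under that reading, not llama.cpp's API; the max - n*std threshold mirrors the mask loop that continues past this excerpt):

import numpy as np

def top_n_sigma_mask(logits: np.ndarray, n: float) -> np.ndarray:
    # Statistics are computed only over tokens that are not already -inf,
    # matching the patched C++ (valid_count instead of cur_p->size).
    valid = logits != -np.inf
    if n <= 0.0 or valid.sum() <= 1:
        return logits
    mx   = logits[valid].max()
    mean = logits[valid].mean()
    std  = logits[valid].std()   # population std, i.e. sqrt(acc / valid_count)
    out = logits.copy()
    out[out < mx - n * std] = -np.inf
    return out

logits = np.array([5.0, 4.5, 1.0, -np.inf])
print(top_n_sigma_mask(logits, n=1.0))   # [5. 4.5 -inf -inf]: outlier masked, -inf preserved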

File diff suppressed because it is too large