Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	README.md
#	common/log.h
Concedo 2023-09-02 11:24:28 +08:00
commit eed651494e
26 changed files with 1143 additions and 658 deletions

llama.cpp

@@ -3610,7 +3610,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
@@ -3647,7 +3647,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
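Worth noting: for an existing lvalue such as stack, push_back(stack) and emplace_back(stack) both copy-construct the new element, so these two hunks are behavior-neutral; emplace_back only pays off when the element is built in place from constructor arguments. A minimal standalone sketch:

    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> v;
        std::string s = "existing lvalue";

        v.push_back(s);      // copy-constructs the element from s
        v.emplace_back(s);   // identical here: also copy-constructs from s

        v.emplace_back(5, 'x');           // constructs "xxxxx" in place, no temporary
        v.push_back(std::string(5, 'x')); // builds a temporary, then moves it
    }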
@@ -4406,7 +4406,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
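Adding const lets the method be called on a const llama_logit_info. The body is the numerically stable softmax: with max_l the largest logit and normalizer = 1 / sum_j exp(l_j - max_l), subtracting max_l before exponentiating avoids float overflow. A self-contained sketch of the same pattern (names are illustrative, not taken from the file):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Stable softmax: p_i = exp(l_i - max_l) / sum_j exp(l_j - max_l)
    std::vector<float> softmax(const std::vector<float> & logits) {
        const float max_l = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (float l : logits) {
            sum += std::exp(l - max_l);
        }
        const float normalizer = 1.0f / sum; // plays the role of the cached normalizer above
        std::vector<float> probs;
        probs.reserve(logits.size());
        for (float l : logits) {
            probs.push_back(normalizer * std::exp(l - max_l));
        }
        return probs;
    }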
@@ -4696,6 +4696,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);

+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
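only_copy is the new option this merge pulls in: when it is set, the output file inherits the input model's ftype, so the metadata stays consistent with the tensors actually written. A sketch of how a caller might request a pure copy through the public API (file paths are placeholders):

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.only_copy = true; // copy tensors verbatim; ftype is inherited from the input

        // Placeholder paths; returns 0 on success.
        return llama_model_quantize("input.gguf", "output.gguf", &params);
    }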
@@ -4782,18 +4786,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;

         enum ggml_type new_type;
         void * new_data;
         size_t new_size;

-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4892,7 +4891,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);

             float * f32_data;
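Taken together, the last two hunks move the same-type check: quantize no longer compares against quantized_type up front, because the k-quant logic above may still override new_type per tensor. Only after those overrides does quantize = tensor->type != new_type decide whether anything is left to do, and a tensor that already has the target type falls into the pass-through branch. A condensed sketch of the resulting control flow (paraphrased, with a hypothetical pick_target_type standing in for the override logic):

    // Hypothetical condensation of the per-tensor loop above.
    if (quantize) {
        new_type = pick_target_type(tensor, quantized_type); // base type + per-tensor overrides
        quantize = (tensor->type != new_type);               // same type? nothing to do
    }
    if (!quantize) {
        new_type = tensor->type;    // pass the tensor through unchanged
        new_data = tensor->data;
        new_size = ggml_nbytes(tensor);
    } else {
        // dequantize to f32 if needed, then quantize into new_type
    }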
@@ -5323,6 +5331,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                     =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize          =*/ false,
         /*.quantize_output_tensor    =*/ true,
+        /*.only_copy                 =*/ false,
     };

     return result;
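The /*.only_copy =*/ comment mimics a designated initializer, which the project's C++ standard at the time predates; with plain aggregate initialization the new field must be appended in the exact order it is declared in llama.h. Under C++20 the same defaults could be spelled explicitly (a sketch listing only the fields visible in this hunk; omitted members value-initialize):

    llama_model_quantize_params result = {
        .ftype                  = LLAMA_FTYPE_MOSTLY_Q5_1,
        .allow_requantize       = false,
        .quantize_output_tensor = true,
        .only_copy              = false,
    };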