Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
2025-09-11 01:24:36 +00:00 · 2023-10-07 01:36:14 +08:00 · 2023-10-07 01:36:14 +08:00 · d8f7a7077a
commit d8f7a7077a
parent 120695ddf7 9ca79d5cbb
8 changed files with 242 additions and 111 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -126,6 +126,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
    }
    s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) { // true when a and b differ by at most abs_tol
+    // Reject a negative tolerance by throwing std::invalid_argument (a NaN tolerance is not caught by this test)
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality short-circuit; this also accepts two infinities of the same sign
+    if (a == b) {
+        return true;
+    }
+
+    // Any infinity that reaches this point (opposite signs, or infinite vs. finite) is never close, whatever abs_tol is
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance; a NaN operand makes the comparison false
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@ -974,7 +995,24 @@ struct llama_hparams {
    float rope_freq_scale_train;

    bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only != other.vocab_only) return true; // member-by-member comparison: the first differing member means "not equal"
+        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
+
+        const float EPSILON = 1e-9; // absolute tolerance for the float members below; NOTE(review): 1e-9 is smaller than the float spacing for magnitudes above ~0.016, so such values are effectively compared exactly - confirm this is intended
+
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false; // every compared member matched
    }

    uint32_t n_gqa() const {
@ -1049,6 +1087,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
    bool has_shift = false;

+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;

@ -1306,6 +1347,8 @@ static bool llama_kv_cache_init(

 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
        const struct llama_batch & batch) {
@ -1321,8 +1364,8 @@ static bool llama_kv_cache_find_slot(

    while (true) {
        if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
            cache.head = 0;
-            n_tested   += n_ctx - cache.head;
            continue;
        }

@ -1373,6 +1416,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
        cache.cells[i].pos = -1;
        cache.cells[i].seq_id.clear();
    }
+
+    // Searching for a free slot can start here since we know it will be empty.
+    cache.head = uint32_t(c0);
 }

 static void llama_kv_cache_seq_rm(
@ -1380,6 +1426,8 @@ static void llama_kv_cache_seq_rm(
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1) {
+    uint32_t new_head = cache.size;
+
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@ -1388,9 +1436,13 @@ static void llama_kv_cache_seq_rm(
            cache.cells[i].seq_id.erase(seq_id);
            if (cache.cells[i].seq_id.empty()) {
                cache.cells[i].pos = -1;
+                if (new_head == cache.size) new_head = i;
            }
        }
    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
@ -1402,6 +1454,8 @@ static void llama_kv_cache_seq_cp(
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+    cache.head = 0;
+
    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
            cache.cells[i].seq_id.insert(seq_id_dst);
@ -1410,12 +1464,18 @@ static void llama_kv_cache_seq_cp(
 }

 static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    uint32_t new_head = cache.size; // cache.size serves as the "no cell cleared yet" sentinel
+
    for (uint32_t i = 0; i < cache.size; ++i) {
        if (!cache.cells[i].has_seq_id(seq_id)) {
            cache.cells[i].pos = -1;
            cache.cells[i].seq_id.clear();
+            if (new_head == cache.size) new_head = i; // record only the first (lowest-index) cell cleared by this call
        }
    }
+
+    // If we freed up a slot, set head to it so searching can start there; otherwise head is left unchanged.
+    if (new_head != cache.size) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
@ -1424,6 +1484,8 @@ static void llama_kv_cache_seq_shift(
                    llama_pos   p0,
                    llama_pos   p1,
                    llama_pos   delta) {
+    uint32_t new_head = cache.size;
+
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@ -1433,12 +1495,17 @@ static void llama_kv_cache_seq_shift(
            if (cache.cells[i].pos < 0) {
                cache.cells[i].pos = -1;
                cache.cells[i].seq_id.clear();
+                if (new_head == cache.size) new_head = i;
            } else {
                cache.has_shift = true;
                cache.cells[i].delta = delta;
            }
        }
    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    cache.head = new_head != cache.size ? new_head : 0;
 }

 //
@ -4460,10 +4527,6 @@ static int llama_decode_internal(
        batch.seq_id = seq_id.data();
    }

-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
    if (!llama_kv_cache_find_slot(kv_self, batch)) {
        return 1;
    }
@ -4549,8 +4612,12 @@ static int llama_decode_internal(
 #endif

    // update the kv ring buffer
-    lctx.kv_self.head      += n_tokens;
    lctx.kv_self.has_shift  = false;
+    lctx.kv_self.head      += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }

 #ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
@ -8190,14 +8257,14 @@ void llama_print_timings(struct llama_context * ctx) {
    const llama_timings timings = llama_get_timings(ctx);

    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {