fix context shifting

2025-09-10 23:54:34 +00:00 · 2025-05-19 16:58:35 +04:00 · 2025-05-19 16:58:35 +04:00 · c54a6a0132
commit c54a6a0132
parent 07c4966a80
8 changed files with 397 additions and 73 deletions
--- a/include/llama.h
+++ b/include/llama.h
@ -283,7 +283,7 @@ extern "C" {
        uint32_t n_world; // number of nodes
        uint32_t rank; // my node rank
        uint32_t n_layer_window[32]; // number of layers to kept each time
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

        // main_gpu interpretation depends on split_mode:
@ -707,8 +707,10 @@ extern "C" {
    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx);
+
+    // Notify other devices to clear their KV cache
+    LLAMA_API void llama_send_kv_cache_clear(struct llama_context * ctx);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@ -720,6 +722,13 @@ extern "C" {
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1);
+    
+    // Notify other nodes to remove a range from their KV cache
+    LLAMA_API void llama_send_kv_cache_seq_rm(
+            struct llama_context * ctx, 
+                    llama_seq_id   seq_id, 
+                       llama_pos   p0, 
+                       llama_pos   p1);

    // Copy all tokens that belong to the specified sequence to another sequence
    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
@ -731,6 +740,14 @@ extern "C" {
                    llama_seq_id   seq_id_dst,
                       llama_pos   p0,
                       llama_pos   p1);
+    
+    // Notify other nodes to copy a range of KV entries 
+    LLAMA_API void llama_send_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1);

    // Removes all tokens that do not belong to the specified sequence
    LLAMA_API void llama_kv_cache_seq_keep(
@ -750,6 +767,14 @@ extern "C" {
                       llama_pos   p1,
                       llama_pos   delta);

+    // Notify other nodes to shift (add) their KV cache entries
+    LLAMA_API void llama_send_kv_cache_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta);
+
    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
@ -762,6 +787,14 @@ extern "C" {
                       llama_pos   p0,
                       llama_pos   p1,
                             int   d);
+    
+    // Notify other nodes to perform a division operation on a KV cache range
+    LLAMA_API void llama_send_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);

    // Returns the largest position present in the KV cache for the specified sequence
    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(