mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-10 23:54:34 +00:00
fix context shifting
parent 07c4966a80
commit c54a6a0132
8 changed files with 397 additions and 73 deletions
@@ -283,7 +283,7 @@ extern "C" {
         uint32_t n_world;            // number of nodes
         uint32_t rank;               // my node rank
         uint32_t n_layer_window[32]; // number of layers to kept each time
-        int32_t n_gpu_layers;        // number of layers to store in VRAM
+        int32_t n_gpu_layers;        // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs

         // main_gpu interpretation depends on split_mode:
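The first hunk shows the fields prima.cpp adds for distributed execution: the cluster size, this process's rank, and a per-node window of layers to process each time. As a rough configuration sketch; it is an assumption, based on the neighboring n_gpu_layers and split_mode fields, that these live in llama_model_params, and the window sizes are invented:

    #include "llama.h"

    // Hypothetical two-node setup; the struct placement and the values
    // are illustrative assumptions, not taken from this commit.
    static struct llama_model_params make_node_params(uint32_t rank) {
        struct llama_model_params p = llama_model_default_params();
        p.n_world           = 2;     // number of nodes
        p.rank              = rank;  // this node's rank
        p.n_layer_window[0] = 16;    // layers node 0 handles each time
        p.n_layer_window[1] = 16;    // layers node 1 handles each time
        p.n_gpu_layers      = 16;    // layers stored in VRAM on this node
        return p;
    }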
@@ -707,8 +707,10 @@ extern "C" {
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx);
+
+    // Notify other devices to clear their KV cache
+    LLAMA_API void llama_send_kv_cache_clear(struct llama_context * ctx);

     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
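The clear/send pair above suggests the pattern the rest of the commit follows: apply an operation to the local KV cache, then broadcast the same operation to the other nodes. A minimal sketch (the wrapper is hypothetical, not part of this commit):

    #include "llama.h"

    // Hypothetical helper: wipe the local KV cache, then ask every other
    // device to clear theirs so the whole cluster starts from empty.
    static void cluster_kv_clear(struct llama_context * ctx) {
        llama_kv_cache_clear(ctx);       // erase cell info, zero KV data locally
        llama_send_kv_cache_clear(ctx);  // notify the other devices
    }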
@@ -720,6 +722,13 @@ extern "C" {
             llama_seq_id seq_id,
             llama_pos p0,
             llama_pos p1);

+    // Notify other nodes to remove a range from their KV cache
+    LLAMA_API void llama_send_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
+
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
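Because llama_kv_cache_seq_rm can fail on a partial sequence, a caller would presumably broadcast only after the local removal succeeds. A sketch, with the wrapper name invented for illustration:

    #include <stdbool.h>
    #include "llama.h"

    // Hypothetical helper: remove positions [p0, p1) of seq_id locally,
    // and on success ask the other nodes to remove the same range.
    static bool cluster_kv_seq_rm(struct llama_context * ctx, llama_seq_id seq_id,
                                  llama_pos p0, llama_pos p1) {
        if (!llama_kv_cache_seq_rm(ctx, seq_id, p0, p1)) {
            return false;  // a partial sequence could not be removed locally
        }
        llama_send_kv_cache_seq_rm(ctx, seq_id, p0, p1);  // keep peers consistent
        return true;
    }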
@@ -731,6 +740,14 @@ extern "C" {
             llama_seq_id seq_id_dst,
             llama_pos p0,
             llama_pos p1);

+    // Notify other nodes to copy a range of KV entries
+    LLAMA_API void llama_send_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
+
     // Removes all tokens that do not belong to the specified sequence
     LLAMA_API void llama_kv_cache_seq_keep(
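The copy variant follows the same local-then-broadcast shape; since the local copy only reassigns tokens to the new sequence, mirroring it on the peers is cheap. Sketch (wrapper name assumed):

    #include "llama.h"

    // Hypothetical helper: assign the tokens of seq_id_src in [p0, p1) to
    // seq_id_dst locally (no extra KV memory), then mirror it on the peers.
    static void cluster_kv_seq_cp(struct llama_context * ctx,
                                  llama_seq_id seq_id_src, llama_seq_id seq_id_dst,
                                  llama_pos p0, llama_pos p1) {
        llama_kv_cache_seq_cp     (ctx, seq_id_src, seq_id_dst, p0, p1);
        llama_send_kv_cache_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
    }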
@@ -750,6 +767,14 @@ extern "C" {
             llama_pos p1,
             llama_pos delta);

+    // Notify other nodes to shift (add) their KV cache entries
+    LLAMA_API void llama_send_kv_cache_seq_add(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
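llama_kv_cache_seq_add is the primitive behind the context shifting this commit fixes: discard the oldest tokens after some keep-prefix, then slide the remaining positions back. On a cluster, every node has to apply the identical shift, which is plausibly what the send variant is for. A sketch modeled on the shift llama.cpp's examples perform; the wrapper and its parameter names are illustrative:

    #include "llama.h"

    // Hypothetical distributed context shift for one sequence:
    // drop [n_keep, n_keep + n_discard), then move [n_keep + n_discard, n_past)
    // back by n_discard, locally first and then on every other node.
    static void cluster_context_shift(struct llama_context * ctx, llama_seq_id seq_id,
                                      llama_pos n_keep, llama_pos n_past, llama_pos n_discard) {
        llama_kv_cache_seq_rm      (ctx, seq_id, n_keep, n_keep + n_discard);
        llama_send_kv_cache_seq_rm (ctx, seq_id, n_keep, n_keep + n_discard);

        llama_kv_cache_seq_add     (ctx, seq_id, n_keep + n_discard, n_past, -n_discard);
        llama_send_kv_cache_seq_add(ctx, seq_id, n_keep + n_discard, n_past, -n_discard);
    }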
@@ -762,6 +787,14 @@ extern "C" {
             llama_pos p0,
             llama_pos p1,
             int d);

+    // Notify other nodes to perform a division operation on a KV cache range
+    LLAMA_API void llama_send_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
+
     // Returns the largest position present in the KV cache for the specified sequence
     LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
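Integer division of positions is what group-attention / self-extend style tricks use to squeeze d RoPE positions into one; broadcasting it keeps every node's RoPE state consistent. Sketch (wrapper name assumed):

    #include "llama.h"

    // Hypothetical helper: compress positions in [p0, p1) by an integer
    // factor d > 1 locally, then have the other nodes do the same.
    static void cluster_kv_seq_div(struct llama_context * ctx, llama_seq_id seq_id,
                                   llama_pos p0, llama_pos p1, int d) {
        llama_kv_cache_seq_div     (ctx, seq_id, p0, p1, d);
        llama_send_kv_cache_seq_div(ctx, seq_id, p0, p1, d);
    }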