Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00

Commit 7c64845dea

Merge branch 'master' into concedo_experimental

# Conflicts:
#   .devops/nix/sif.nix
#   .github/workflows/build.yml
#   .github/workflows/python-check-requirements.yml
#   README-sycl.md
#   README.md
#   flake.lock
#   flake.nix
#   requirements/requirements-convert-hf-to-gguf.txt
#   scripts/compare-llama-bench.py

41 changed files with 3325 additions and 2053 deletions
llama.cpp (484 changed lines)
@@ -109,6 +109,7 @@
 #define LLAMA_MAX_NODES 8192
+#define LLAMA_MAX_EXPERTS 8

 //
 // logging
 //
@@ -235,10 +236,11 @@ enum llm_arch {
 LLM_ARCH_INTERNLM2,
 LLM_ARCH_MINICPM,
 LLM_ARCH_GEMMA,
+LLM_ARCH_STARCODER2,
 LLM_ARCH_UNKNOWN,
 };

-static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_LLAMA, "llama" },
 { LLM_ARCH_FALCON, "falcon" },
 { LLM_ARCH_GPT2, "gpt2" },
@@ -262,6 +264,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_INTERNLM2, "internlm2" },
 { LLM_ARCH_MINICPM, "minicpm" },
 { LLM_ARCH_GEMMA, "gemma" },
+{ LLM_ARCH_STARCODER2, "starcoder2" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

 enum llm_kv {
@@ -322,7 +326,7 @@ enum llm_kv {
 LLM_KV_TOKENIZER_RWKV,
 };

-static std::map<llm_kv, const char *> LLM_KV_NAMES = {
+static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
 { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
 { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -386,7 +390,7 @@ struct LLM_KV {
 llm_arch arch;

 std::string operator()(llm_kv kv) const {
-return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
+return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
 }
 };
@@ -421,7 +425,7 @@ enum llm_tensor {
 LLM_TENSOR_LAYER_OUT_NORM,
 };

-static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 {
 LLM_ARCH_LLAMA,
 {
@@ -803,6 +807,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_STARCODER2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -836,38 +858,38 @@ struct LLM_TN {
 llm_arch arch;

 std::string operator()(llm_tensor tensor) const {
-if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return LLM_TENSOR_NAMES[arch].at(tensor);
+return LLM_TENSOR_NAMES.at(arch).at(tensor);
 }

 std::string operator()(llm_tensor tensor, const std::string & suffix) const {
-if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
+return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
 }

 std::string operator()(llm_tensor tensor, int bid) const {
-if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
 }

 std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
-if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
 }

 std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
-if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
 return "__missing__";
 }
-return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
 }
 };
@@ -875,16 +897,16 @@ struct LLM_TN {
 // gguf helpers
 //

-static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
 { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
 { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
 { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };

-static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
 for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
 if (kv.second == name) {
-return kv.first;
+return (llama_rope_scaling_type) kv.first;
 }
 }
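For reference, the retyped helper above now hands callers the enum directly instead of a raw int32_t; a minimal usage sketch (hypothetical caller code, not part of this commit):

    // resolve a GGUF metadata string such as "yarn" to the typed enum
    const llama_rope_scaling_type st = llama_rope_scaling_type_from_string("yarn");
    // st == LLAMA_ROPE_SCALING_TYPE_YARN; strings not present in LLAMA_ROPE_SCALING_TYPES
    // fall through to the function's default return, which this hunk does not show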
@@ -1437,7 +1459,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
 buft = ggml_backend_cuda_host_buffer_type();
 }
 #elif defined(GGML_USE_SYCL)
-buft = ggml_backend_sycl_host_buffer_type();
+if (host_buffer) {
+buft = ggml_backend_sycl_host_buffer_type();
+}
 #elif defined(GGML_USE_CPU_HBM)
 buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1491,6 +1515,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 #endif

+#ifdef GGML_USE_SYCL
+if (ggml_backend_sycl_get_device_count() > 1) {
+buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+}
+#endif
+
 if (buft == nullptr) {
 buft = llama_default_buffer_type_offload(fallback_gpu);
 }
@@ -1502,6 +1532,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
 return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
 return ggml_backend_vk_get_device_count();
 #else
@@ -1515,6 +1547,11 @@ static size_t llama_get_device_memory(int device) {
 size_t free;
 ggml_backend_cuda_get_device_memory(device, &total, &free);
 return free;
+#elif defined(GGML_USE_SYCL)
+size_t total;
+size_t free;
+ggml_backend_sycl_get_device_memory(device, &total, &free);
+return free;
 #elif defined(GGML_USE_VULKAN)
 size_t total;
 size_t free;
@@ -1603,7 +1640,6 @@ struct llama_hparams {
 float rope_freq_base_train;
 float rope_freq_scale_train;
 uint32_t n_yarn_orig_ctx;
-int32_t rope_scaling_type_train;

 float f_clamp_kqv = 0.0f;
 float f_max_alibi_bias = 0.0f;
@@ -1611,8 +1647,9 @@ struct llama_hparams {
 bool causal_attn = true;
 bool need_kq_pos = false;

-enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
-enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

 bool operator!=(const llama_hparams & other) const {
 if (this->vocab_only != other.vocab_only) return true;
@@ -1661,8 +1698,8 @@ struct llama_cparams {
 uint32_t n_threads; // number of threads to use for generation
 uint32_t n_threads_batch; // number of threads to use for batch processing

-float rope_freq_base;
-float rope_freq_scale;
+float rope_freq_base;
+float rope_freq_scale;

 uint32_t n_yarn_orig_ctx;
 // These hyperparameters are not exposed in GGUF, because all
@@ -1673,9 +1710,8 @@ struct llama_cparams {
 float yarn_beta_slow;
 float defrag_thold;

-bool mul_mat_q;
 bool offload_kqv;
-bool do_pooling;
+enum llama_pooling_type pooling_type;

 ggml_backend_sched_eval_callback cb_eval;
 void * cb_eval_user_data;
@@ -1983,6 +2019,9 @@ struct llama_context {
 std::vector<uint8_t> buf_compute_meta;
 ggml_backend_sched_t sched = nullptr;

+ggml_abort_callback abort_callback = nullptr;
+void * abort_callback_data = nullptr;
+
 // input tensors
 ggml_backend_buffer_t buf_input = nullptr;
 ggml_context * ctx_input = nullptr;
@@ -2149,10 +2188,12 @@ static bool llama_kv_cache_find_slot(
 }

 // find how many cells are currently in use
-static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-for (uint32_t i = cache.size - 1; i > 0; --i) {
-if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
-return i + 1;
+static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+for (uint32_t i = cache.size; i > 0; --i) {
+const llama_kv_cell & cell = cache.cells[i - 1];
+
+if (cell.pos >= 0 && !cell.is_empty()) {
+return i;
 }
 }
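The rewritten llama_kv_cache_cell_max() above walks the cache from the back and returns the number of leading cells that still need to be considered, i.e. the index just past the last occupied cell. A self-contained sketch of the same scan on a plain vector, illustrative only and not part of the diff:

    #include <cstdint>
    #include <vector>

    // pos >= 0 marks an occupied cell, mirroring llama_kv_cell
    static uint32_t cell_max(const std::vector<int> & pos) {
        for (uint32_t i = (uint32_t) pos.size(); i > 0; --i) {
            if (pos[i - 1] >= 0) {
                return i;
            }
        }
        return 0;
    }

    // e.g. {5, 7, -1, 3, -1, -1} -> 4: cells 0..3 cover every occupied slot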
@@ -2938,7 +2979,11 @@ template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
 uint32_t tmp;
 const bool found = get_key(kid, tmp, required);
-result = (enum llama_pooling_type) tmp;
+if (found) {
+result = (enum llama_pooling_type) tmp;
+} else {
+result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+}
 return found;
 }
@@ -3215,7 +3260,7 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

 switch (hparams.n_layer) {
 case 3:
@@ -3367,6 +3412,16 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_STARCODER2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+switch (hparams.n_layer) {
+case 30: model.type = e_model::MODEL_3B; break;
+case 32: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_15B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
 }
@@ -4563,6 +4618,56 @@ static bool llm_load_tensors(
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
 }
 } break;
+case LLM_ARCH_STARCODER2:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ml.n_created--; // artificial tensor
+ml.size_data += ggml_nbytes(model.output);
+}
+
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+// optional bias tensors
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+// optional bias tensors
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -5146,7 +5251,7 @@ struct llm_build_context {
 n_kv (worst_case ? n_ctx : kv_self.n),
 kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
 n_orig_ctx (cparams.n_yarn_orig_ctx),
-pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
+pooling_type (cparams.pooling_type),
 rope_type (hparams.rope_type),
 cb (cb),
 buf_compute_meta (lctx.buf_compute_meta) {
@@ -7632,6 +7737,120 @@ struct llm_build_context {

 return gf;
 }
+
+struct ggml_cgraph * build_starcoder2() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+const int64_t n_embd_head = hparams.n_embd_head_v;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+cb(inpL, "inp_embd", -1);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+cb(inp_pos, "inp_pos", -1);
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+cb(KQ_mask, "KQ_mask", -1);
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+LLM_NORM, cb, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+if (model.layers[il].bq) {
+Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+}
+
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+if (model.layers[il].bk) {
+Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+}
+
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+if (model.layers[il].bv) {
+Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+}
+
+Qcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_custom(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+model.layers[il].wo, model.layers[il].bo,
+Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+cb(cur, "kqv_out", il);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// feed-forward network
+
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+LLM_NORM, cb, il);
+cb(cur, "ffn_norm", il);
+
+cur = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+NULL, NULL,
+model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+NULL,
+LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+cb(cur, "ffn_out", il);
+cur = ggml_add(ctx0, cur, ffn_inp);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm, model.output_norm_b,
+LLM_NORM, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = ggml_mul_mat(ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -7778,6 +7997,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_gemma();
 } break;
+case LLM_ARCH_STARCODER2:
+{
+result = llm.build_starcoder2();
+} break;
 default:
 GGML_ASSERT(false);
 }
@@ -7868,7 +8091,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 }
 }

-if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
 const int64_t n_tokens = batch.n_tokens;

 GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -7896,7 +8119,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 }
 }

-if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
 const int64_t n_tokens = batch.n_tokens;

 GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -7929,6 +8152,7 @@ static void llama_graph_compute(

 if (lctx.backend_cpu != nullptr) {
 ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
 }

 ggml_backend_sched_graph_compute(lctx.sched, gf);
@@ -8032,7 +8256,7 @@ static int llama_decode_internal(
 // a heuristic, to avoid attending the full cache if it is not yet utilized
 // after enough generations, the benefit from this heuristic disappears
 // if we start defragmenting the cache, the benefit from this will be more important
-kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
 //kv_self.n = llama_kv_cache_cell_max(kv_self);

 //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
@@ -8059,9 +8283,6 @@ static int llama_decode_internal(
 GGML_ASSERT(false);
 }

-#if defined(GGML_USE_CUBLAS)
-ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
-#endif
 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

 // for big prompts, if BLAS is enabled, it is better to use only one thread
@@ -9243,10 +9464,10 @@ struct llm_tokenizer_wpm {
 std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
 std::vector<uint32_t> nfd_codepoints;
 for (uint32_t code : codepoints) {
-auto it = nfd_map.find(code);
-if (it != nfd_map.end()) {
-for (uint32_t c : it->second) {
-nfd_codepoints.push_back(c);
+auto it = nfd_map.equal_range(code);
+if (it.first != it.second) {
+for (auto jt = it.first; jt != it.second; jt++) {
+nfd_codepoints.push_back(jt->second);
 }
 } else {
 nfd_codepoints.push_back(code);
@@ -9297,12 +9518,13 @@ struct llm_tokenizer_wpm {
 }

 uint32_t to_lower(uint32_t code) {
+static const std::locale locale("en_US.UTF-8");
 #if defined(_WIN32)
 if (code > 0xFFFF) {
 return code;
 }
 #endif
-return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
+return std::tolower(wchar_t(code), locale);
 }

 bool is_ascii_punct(uint32_t code) {
@@ -10941,7 +11163,7 @@ struct quantize_state_internal {
 {}
 };

-static void llama_convert_tensor_internal(
+static void llama_tensor_dequantize_internal(
 struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
 const size_t nelements, const int nthread
 ) {
@@ -11282,6 +11504,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
 return new_type;
 }

+static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+std::mutex mutex;
+int counter = 0;
+size_t new_size = 0;
+if (nthread < 2) {
+// single-thread
+return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
+}
+auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
+nrows, n_per_row, imatrix]() {
+std::array<int64_t, 1 << 4> local_hist = {};
+const int nrows_per_chunk = chunk_size / n_per_row;
+size_t local_size = 0;
+while (true) {
+std::unique_lock<std::mutex> lock(mutex);
+int first_row = counter; counter += nrows_per_chunk;
+if (first_row >= nrows) {
+if (local_size > 0) {
+for (int j=0; j<int(local_hist.size()); ++j) {
+hist_cur[j] += local_hist[j];
+}
+new_size += local_size;
+}
+break;
+}
+lock.unlock();
+const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
+first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
+}
+};
+for (int it = 0; it < nthread - 1; ++it) {
+workers.emplace_back(compute);
+}
+compute();
+for (auto & w : workers) { w.join(); }
+workers.clear();
+return new_size;
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
 ggml_type quantized_type;
 llama_ftype ftype = params->ftype;
@@ -11394,7 +11656,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 std::vector<std::thread> workers;
 workers.reserve(nthread);
-std::mutex mutex;

 int idx = 0;
@@ -11508,7 +11769,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
 } else {
-llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
 f32_data = (float *) f32_conv_buf.data();
 }
@@ -11529,41 +11790,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 const int nchunk = (nelements + chunk_size - 1)/chunk_size;
 const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-if (nthread_use < 2) {
-new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
-} else {
-int counter = 0;
-new_size = 0;
-auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
-nrows, n_per_row, imatrix]() {
-std::array<int64_t, 1 << 4> local_hist = {};
-const int nrows_per_chunk = chunk_size / n_per_row;
-size_t local_size = 0;
-while (true) {
-std::unique_lock<std::mutex> lock(mutex);
-int first_row = counter; counter += nrows_per_chunk;
-if (first_row >= nrows) {
-if (local_size > 0) {
-for (int j=0; j<int(local_hist.size()); ++j) {
-hist_cur[j] += local_hist[j];
-}
-new_size += local_size;
-}
-break;
-}
-lock.unlock();
-const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
-first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
-}
-};
-for (int it = 0; it < nthread_use - 1; ++it) {
-workers.emplace_back(compute);
-}
-compute();
-for (auto & w : workers) { w.join(); }
-workers.clear();
-}
+new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);

 LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
 int64_t tot_count = 0;
@@ -11940,6 +12167,7 @@ struct llama_context_params llama_context_default_params() {
 /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
 /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
 /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
 /*.rope_freq_base =*/ 0.0f,
 /*.rope_freq_scale =*/ 0.0f,
 /*.yarn_ext_factor =*/ -1.0f,
@@ -11952,11 +12180,11 @@ struct llama_context_params llama_context_default_params() {
 /*.cb_eval_user_data =*/ nullptr,
 /*.type_k =*/ GGML_TYPE_F16,
 /*.type_v =*/ GGML_TYPE_F16,
-/*.mul_mat_q =*/ true,
 /*.logits_all =*/ false,
 /*.embedding =*/ false,
 /*.offload_kqv =*/ true,
-/*.do_pooling =*/ true,
+/*.abort_callback =*/ nullptr,
+/*.abort_callback_data =*/ nullptr,
 };

 return result;
@@ -12094,9 +12322,8 @@ struct llama_context * llama_new_context_with_model(
 cparams.yarn_beta_fast = params.yarn_beta_fast;
 cparams.yarn_beta_slow = params.yarn_beta_slow;
 cparams.defrag_thold = params.defrag_thold;
-cparams.mul_mat_q = params.mul_mat_q;
 cparams.offload_kqv = params.offload_kqv;
-cparams.do_pooling = params.do_pooling;
+cparams.pooling_type = params.pooling_type;

 cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
 cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -12122,6 +12349,14 @@ struct llama_context * llama_new_context_with_model(
 cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
 }

+if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+} else {
+cparams.pooling_type = hparams.pooling_type;
+}
+}
+
 if (params.seed == LLAMA_DEFAULT_SEED) {
 params.seed = time(NULL);
 }
@@ -12130,8 +12365,11 @@ struct llama_context * llama_new_context_with_model(
 LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
 LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

-ctx->rng = std::mt19937(params.seed);
-ctx->logits_all = params.logits_all;
+ctx->abort_callback = params.abort_callback;
+ctx->abort_callback_data = params.abort_callback_data;
+
+ctx->rng = std::mt19937(params.seed);
+ctx->logits_all = params.logits_all;

 const ggml_type type_k = params.type_k;
 const ggml_type type_v = params.type_v;
@@ -12189,13 +12427,31 @@ struct llama_context * llama_new_context_with_model(
 }
 #elif defined(GGML_USE_SYCL)
 if (model->n_gpu_layers > 0) {
-ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-if (backend == nullptr) {
-LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-llama_free(ctx);
-return nullptr;
-}
-ctx->backends.push_back(backend);
+// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+if (backend == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+} else {
+// LLAMA_SPLIT_LAYER requires a backend for each GPU
+int id_list[GGML_SYCL_MAX_DEVICES];
+ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+int device_id = id_list[i];
+ggml_backend_t backend = ggml_backend_sycl_init(i);
+if (backend == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+}
+}
 }
 #elif defined(GGML_USE_KOMPUTE)
 if (model->n_gpu_layers > 0) {
@@ -12275,7 +12531,6 @@ struct llama_context * llama_new_context_with_model(
 ggml_set_name(ctx->inp_cls, "inp_cls");

 ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
 LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
 ggml_backend_buffer_name(ctx->buf_input),
 ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
@@ -12396,6 +12651,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_QWEN2:
 case LLM_ARCH_PHI2:
 case LLM_ARCH_GEMMA:
+case LLM_ARCH_STARCODER2:
 return LLAMA_ROPE_TYPE_NEOX;

 // all model arches should be listed explicitly here
@@ -12680,9 +12936,14 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 const size_t s_logits = ctx->logits.capacity() * sizeof(float);
 const size_t s_embedding_size = sizeof(size_t);
 const size_t s_embedding = ctx->embedding.size() * sizeof(float);
-const size_t s_kv_size = sizeof(size_t);
-const size_t s_kv_ntok = sizeof(int);
+const size_t s_kv_buf_size = sizeof(size_t);
+const size_t s_kv_head = sizeof(uint32_t);
+const size_t s_kv_size = sizeof(uint32_t);
+const size_t s_kv_used = sizeof(uint32_t);
 const size_t s_kv = ctx->kv_self.total_size();
+// TODO: assume the max is more than 1 seq_id per KV cell
+const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
+const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;

 const size_t s_total = (
 + s_rng_size
@@ -12691,9 +12952,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 + s_logits
 + s_embedding_size
 + s_embedding
++ s_kv_buf_size
++ s_kv_head
 + s_kv_size
-+ s_kv_ntok
++ s_kv_used
 + s_kv
++ s_kv_cells
 );

 return s_total;
@@ -12793,15 +13057,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 {
 const auto & kv_self = ctx->kv_self;
 const auto & hparams = ctx->model.hparams;
-const auto & cparams = ctx->cparams;

 const uint32_t n_layer = hparams.n_layer;
 const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-const uint32_t n_ctx = cparams.n_ctx;

 const size_t kv_buf_size = kv_self.total_size();
-const uint32_t kv_head = kv_self.head;
+const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
 const uint32_t kv_size = kv_self.size;
 const uint32_t kv_used = kv_self.used;
@@ -12821,7 +13083,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat

 // v is not contiguous, copy row by row
 const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);

 tmp_buf.resize(v_row_size);
 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@@ -12831,7 +13093,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 }
 }

-for (uint32_t i = 0; i < kv_size; ++i) {
+for (uint32_t i = 0; i < kv_head; ++i) {
 const auto & cell = kv_self.cells[i];

 const llama_pos pos = cell.pos;
@@ -12907,12 +13169,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 {
 const auto & kv_self = ctx->kv_self;
 const auto & hparams = ctx->model.hparams;
-const auto & cparams = ctx->cparams;

 const uint32_t n_layer = hparams.n_layer;
 const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-const uint32_t n_ctx = cparams.n_ctx;

 size_t kv_buf_size;
 uint32_t kv_head;
@@ -12935,7 +13195,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {

 // v is not contiguous, copy row by row
 const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);

 for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
 ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@@ -12944,13 +13204,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 }
 }

+GGML_ASSERT(kv_self.size == kv_size);
+
 ctx->kv_self.head = kv_head;
 ctx->kv_self.size = kv_size;
 ctx->kv_self.used = kv_used;

 ctx->kv_self.cells.resize(kv_size);

-for (uint32_t i = 0; i < kv_size; ++i) {
+for (uint32_t i = 0; i < kv_head; ++i) {
 llama_pos pos;
 size_t seq_id_size;
@@ -12966,6 +13228,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 ctx->kv_self.cells[i].seq_id.insert(seq_id);
 }
 }
+
+for (uint32_t i = kv_head; i < kv_size; ++i) {
+ctx->kv_self.cells[i].pos = -1;
+ctx->kv_self.cells[i].seq_id.clear();
+}
 }

 const size_t nread = inp - src;
@@ -13073,6 +13340,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
 ctx->cparams.n_threads_batch = n_threads_batch;
 }

+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+ctx->abort_callback = abort_callback;
+ctx->abort_callback_data = abort_callback_data;
+}
+
 struct llama_batch llama_batch_get_one(
 llama_token * tokens,
 int32_t n_tokens,
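The abort-callback plumbing added in this commit (the llama_context fields, the ggml_backend_cpu_set_abort_callback call in llama_graph_compute, and the new llama_set_abort_callback entry point above) lets an application stop a long-running decode from outside. A minimal application-side sketch, assuming only the declarations visible in these hunks; the deadline logic is illustrative, not part of the diff:

    #include <chrono>

    // ggml polls this callback during graph compute; returning true requests an abort
    static bool abort_after_deadline(void * data) {
        const auto * deadline = static_cast<std::chrono::steady_clock::time_point *>(data);
        return std::chrono::steady_clock::now() > *deadline;
    }

    // after creating the context with llama_new_context_with_model(...):
    auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(30);
    llama_set_abort_callback(ctx, abort_after_deadline, &deadline);
    // subsequent llama_decode() calls on ctx can be cut short once the deadline passes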