Merge branch 'master' into concedo_experimental

# Conflicts:
#	.devops/nix/sif.nix
#	.github/workflows/build.yml
#	.github/workflows/python-check-requirements.yml
#	README-sycl.md
#	README.md
#	flake.lock
#	flake.nix
#	requirements/requirements-convert-hf-to-gguf.txt
#	scripts/compare-llama-bench.py
This commit is contained in:
Concedo 2024-03-04 15:33:33 +08:00
commit 7c64845dea
41 changed files with 3325 additions and 2053 deletions

484
llama.cpp
View file

@ -109,6 +109,7 @@
#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_EXPERTS 8
//
// logging
//
@ -235,10 +236,11 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
LLM_ARCH_STARCODER2,
LLM_ARCH_UNKNOWN,
};
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GPT2, "gpt2" },
@ -262,6 +264,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
enum llm_kv {
@ -322,7 +326,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_RWKV,
};
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@ -386,7 +390,7 @@ struct LLM_KV {
llm_arch arch;
std::string operator()(llm_kv kv) const {
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
}
};
@ -421,7 +425,7 @@ enum llm_tensor {
LLM_TENSOR_LAYER_OUT_NORM,
};
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
{
LLM_ARCH_LLAMA,
{
@ -803,6 +807,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_STARCODER2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@ -836,38 +858,38 @@ struct LLM_TN {
llm_arch arch;
std::string operator()(llm_tensor tensor) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
return LLM_TENSOR_NAMES[arch].at(tensor);
return LLM_TENSOR_NAMES.at(arch).at(tensor);
}
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
}
std::string operator()(llm_tensor tensor, int bid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
}
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
}
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
return "__missing__";
}
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
}
};
@ -875,16 +897,16 @@ struct LLM_TN {
// gguf helpers
//
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
};
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
if (kv.second == name) {
return kv.first;
return (llama_rope_scaling_type) kv.first;
}
}
@ -1437,7 +1459,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
buft = ggml_backend_cuda_host_buffer_type();
}
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_host_buffer_type();
if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type();
}
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
@ -1491,6 +1515,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(fallback_gpu);
}
@ -1502,6 +1532,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
static size_t llama_get_device_count() {
#if defined(GGML_USE_CUBLAS)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
@ -1515,6 +1547,11 @@ static size_t llama_get_device_memory(int device) {
size_t free;
ggml_backend_cuda_get_device_memory(device, &total, &free);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &total, &free);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
@ -1603,7 +1640,6 @@ struct llama_hparams {
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
int32_t rope_scaling_type_train;
float f_clamp_kqv = 0.0f;
float f_max_alibi_bias = 0.0f;
@ -1611,8 +1647,9 @@ struct llama_hparams {
bool causal_attn = true;
bool need_kq_pos = false;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
@ -1661,8 +1698,8 @@ struct llama_cparams {
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
float rope_freq_base;
float rope_freq_scale;
uint32_t n_yarn_orig_ctx;
// These hyperparameters are not exposed in GGUF, because all
@ -1673,9 +1710,8 @@ struct llama_cparams {
float yarn_beta_slow;
float defrag_thold;
bool mul_mat_q;
bool offload_kqv;
bool do_pooling;
enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;
@ -1983,6 +2019,9 @@ struct llama_context {
std::vector<uint8_t> buf_compute_meta;
ggml_backend_sched_t sched = nullptr;
ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;
// input tensors
ggml_backend_buffer_t buf_input = nullptr;
ggml_context * ctx_input = nullptr;
@ -2149,10 +2188,12 @@ static bool llama_kv_cache_find_slot(
}
// find how many cells are currently in use
static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
for (uint32_t i = cache.size - 1; i > 0; --i) {
if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
return i + 1;
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
for (uint32_t i = cache.size; i > 0; --i) {
const llama_kv_cell & cell = cache.cells[i - 1];
if (cell.pos >= 0 && !cell.is_empty()) {
return i;
}
}
@ -2938,7 +2979,11 @@ template<>
bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
uint32_t tmp;
const bool found = get_key(kid, tmp, required);
result = (enum llama_pooling_type) tmp;
if (found) {
result = (enum llama_pooling_type) tmp;
} else {
result = LLAMA_POOLING_TYPE_UNSPECIFIED;
}
return found;
}
@ -3215,7 +3260,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 3:
@ -3367,6 +3412,16 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 30: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_15B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0;
}
@ -4563,6 +4618,56 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
ml.n_created--; // artificial tensor
ml.size_data += ggml_nbytes(model.output);
}
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
// optional bias tensors
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
}
} break;
default:
throw std::runtime_error("unknown architecture");
}
@ -5146,7 +5251,7 @@ struct llm_build_context {
n_kv (worst_case ? n_ctx : kv_self.n),
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
n_orig_ctx (cparams.n_yarn_orig_ctx),
pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
@ -7632,6 +7737,120 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
LLM_NORM, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
LLM_NORM, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, model.output_norm_b,
LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -7778,6 +7997,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();
} break;
default:
GGML_ASSERT(false);
}
@ -7868,7 +8091,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
}
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@ -7896,7 +8119,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
}
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@ -7929,6 +8152,7 @@ static void llama_graph_compute(
if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
ggml_backend_sched_graph_compute(lctx.sched, gf);
@ -8032,7 +8256,7 @@ static int llama_decode_internal(
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
@ -8059,9 +8283,6 @@ static int llama_decode_internal(
GGML_ASSERT(false);
}
#if defined(GGML_USE_CUBLAS)
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
#endif
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
// for big prompts, if BLAS is enabled, it is better to use only one thread
@ -9243,10 +9464,10 @@ struct llm_tokenizer_wpm {
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
std::vector<uint32_t> nfd_codepoints;
for (uint32_t code : codepoints) {
auto it = nfd_map.find(code);
if (it != nfd_map.end()) {
for (uint32_t c : it->second) {
nfd_codepoints.push_back(c);
auto it = nfd_map.equal_range(code);
if (it.first != it.second) {
for (auto jt = it.first; jt != it.second; jt++) {
nfd_codepoints.push_back(jt->second);
}
} else {
nfd_codepoints.push_back(code);
@ -9297,12 +9518,13 @@ struct llm_tokenizer_wpm {
}
uint32_t to_lower(uint32_t code) {
static const std::locale locale("en_US.UTF-8");
#if defined(_WIN32)
if (code > 0xFFFF) {
return code;
}
#endif
return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
return std::tolower(wchar_t(code), locale);
}
bool is_ascii_punct(uint32_t code) {
@ -10941,7 +11163,7 @@ struct quantize_state_internal {
{}
};
static void llama_convert_tensor_internal(
static void llama_tensor_dequantize_internal(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
@ -11282,6 +11504,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
return new_type;
}
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
std::mutex mutex;
int counter = 0;
size_t new_size = 0;
if (nthread < 2) {
// single-thread
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
}
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
}
};
for (int it = 0; it < nthread - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
return new_size;
}
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type quantized_type;
llama_ftype ftype = params->ftype;
@ -11394,7 +11656,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<std::thread> workers;
workers.reserve(nthread);
std::mutex mutex;
int idx = 0;
@ -11508,7 +11769,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
@ -11529,41 +11790,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) {
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
} else {
int counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
}
};
for (int it = 0; it < nthread_use - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
}
new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
int64_t tot_count = 0;
@ -11940,6 +12167,7 @@ struct llama_context_params llama_context_default_params() {
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.yarn_ext_factor =*/ -1.0f,
@ -11952,11 +12180,11 @@ struct llama_context_params llama_context_default_params() {
/*.cb_eval_user_data =*/ nullptr,
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
/*.mul_mat_q =*/ true,
/*.logits_all =*/ false,
/*.embedding =*/ false,
/*.offload_kqv =*/ true,
/*.do_pooling =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
return result;
@ -12094,9 +12322,8 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.mul_mat_q = params.mul_mat_q;
cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;
cparams.pooling_type = params.pooling_type;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@ -12122,6 +12349,14 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
}
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
} else {
cparams.pooling_type = hparams.pooling_type;
}
}
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
@ -12130,8 +12365,11 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;
ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v;
@ -12189,13 +12427,31 @@ struct llama_context * llama_new_context_with_model(
}
#elif defined(GGML_USE_SYCL)
if (model->n_gpu_layers > 0) {
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
llama_free(ctx);
return nullptr;
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
int device_id = id_list[i];
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) {
@ -12275,7 +12531,6 @@ struct llama_context * llama_new_context_with_model(
ggml_set_name(ctx->inp_cls, "inp_cls");
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
ggml_backend_buffer_name(ctx->buf_input),
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
@ -12396,6 +12651,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN2:
case LLM_ARCH_PHI2:
case LLM_ARCH_GEMMA:
case LLM_ARCH_STARCODER2:
return LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here
@ -12680,9 +12936,14 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
const size_t s_embedding_size = sizeof(size_t);
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
const size_t s_kv_size = sizeof(size_t);
const size_t s_kv_ntok = sizeof(int);
const size_t s_kv_buf_size = sizeof(size_t);
const size_t s_kv_head = sizeof(uint32_t);
const size_t s_kv_size = sizeof(uint32_t);
const size_t s_kv_used = sizeof(uint32_t);
const size_t s_kv = ctx->kv_self.total_size();
// TODO: assume the max is more than 1 seq_id per KV cell
const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
const size_t s_total = (
+ s_rng_size
@ -12691,9 +12952,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
+ s_logits
+ s_embedding_size
+ s_embedding
+ s_kv_buf_size
+ s_kv_head
+ s_kv_size
+ s_kv_ntok
+ s_kv_used
+ s_kv
+ s_kv_cells
);
return s_total;
@ -12793,15 +13057,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
{
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t n_ctx = cparams.n_ctx;
const size_t kv_buf_size = kv_self.total_size();
const uint32_t kv_head = kv_self.head;
const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
const uint32_t kv_size = kv_self.size;
const uint32_t kv_used = kv_self.used;
@ -12821,7 +13083,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// v is not contiguous, copy row by row
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
tmp_buf.resize(v_row_size);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@ -12831,7 +13093,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
}
}
for (uint32_t i = 0; i < kv_size; ++i) {
for (uint32_t i = 0; i < kv_head; ++i) {
const auto & cell = kv_self.cells[i];
const llama_pos pos = cell.pos;
@ -12907,12 +13169,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
{
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const auto & cparams = ctx->cparams;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
const uint32_t n_ctx = cparams.n_ctx;
size_t kv_buf_size;
uint32_t kv_head;
@ -12935,7 +13195,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
// v is not contiguous, copy row by row
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@ -12944,13 +13204,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
}
}
GGML_ASSERT(kv_self.size == kv_size);
ctx->kv_self.head = kv_head;
ctx->kv_self.size = kv_size;
ctx->kv_self.used = kv_used;
ctx->kv_self.cells.resize(kv_size);
for (uint32_t i = 0; i < kv_size; ++i) {
for (uint32_t i = 0; i < kv_head; ++i) {
llama_pos pos;
size_t seq_id_size;
@ -12966,6 +13228,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
ctx->kv_self.cells[i].seq_id.insert(seq_id);
}
}
for (uint32_t i = kv_head; i < kv_size; ++i) {
ctx->kv_self.cells[i].pos = -1;
ctx->kv_self.cells[i].seq_id.clear();
}
}
const size_t nread = inp - src;
@ -13073,6 +13340,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
ctx->cparams.n_threads_batch = n_threads_batch;
}
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,