mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # Makefile # ggml-cuda.cu # tests/test-tokenizer-0-falcon.py # tests/test-tokenizer-0-llama.py
This commit is contained in:
commit
6bf8ee4aea
29 changed files with 448 additions and 113 deletions
202
llama.cpp
202
llama.cpp
|
@ -92,7 +92,7 @@
|
|||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
#define LLAMA_MAX_NODES 4096
|
||||
#define LLAMA_MAX_NODES 8192
|
||||
|
||||
//
|
||||
// logging
|
||||
|
@ -256,6 +256,8 @@ enum llm_kv {
|
|||
LLM_KV_TOKENIZER_UNK_ID,
|
||||
LLM_KV_TOKENIZER_SEP_ID,
|
||||
LLM_KV_TOKENIZER_PAD_ID,
|
||||
LLM_KV_TOKENIZER_ADD_BOS,
|
||||
LLM_KV_TOKENIZER_ADD_EOS,
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
};
|
||||
|
@ -304,6 +306,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
|||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
};
|
||||
|
@ -601,6 +605,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
|||
return LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
}
|
||||
|
||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
||||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
||||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
||||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
||||
default: return format("unknown type %d", type);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_STRING:
|
||||
return gguf_get_val_str(ctx_gguf, i);
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
if (arr_type == GGUF_TYPE_STRING) {
|
||||
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
||||
// escape quotes
|
||||
replace_all(val, "\\", "\\\\");
|
||||
replace_all(val, "\"", "\\\"");
|
||||
ss << '"' << val << '"';
|
||||
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
||||
ss << "???";
|
||||
} else {
|
||||
ss << gguf_data_to_str(arr_type, data, j);
|
||||
}
|
||||
if (j < arr_n - 1) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
default:
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// ggml helpers
|
||||
//
|
||||
|
@ -1088,9 +1146,9 @@ enum e_model {
|
|||
MODEL_70B,
|
||||
};
|
||||
|
||||
static const size_t kB = 1024;
|
||||
static const size_t MB = 1024*kB;
|
||||
static const size_t GB = 1024*MB;
|
||||
static const size_t kiB = 1024;
|
||||
static const size_t MiB = 1024*kiB;
|
||||
static const size_t GiB = 1024*MiB;
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
|
@ -1281,6 +1339,9 @@ struct llama_vocab {
|
|||
id special_sep_id = -1;
|
||||
id special_pad_id = -1;
|
||||
|
||||
int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
|
||||
|
||||
id linefeed_id = 13;
|
||||
id special_prefix_id = 32007;
|
||||
id special_middle_id = 32009;
|
||||
|
@ -1330,6 +1391,9 @@ struct llama_model {
|
|||
|
||||
int n_gpu_layers;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
// context
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
|
@ -1491,7 +1555,7 @@ static bool llama_kv_cache_init(
|
|||
vram_kv_cache += ggml_nbytes(cache.k);
|
||||
}
|
||||
if (vram_kv_cache > 0) {
|
||||
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1789,10 +1853,10 @@ struct llama_model_loader {
|
|||
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
|
||||
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
ftype = LLAMA_FTYPE_ALL_F32;
|
||||
} break;
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
ftype = LLAMA_FTYPE_ALL_F32;
|
||||
} break;
|
||||
}
|
||||
|
||||
// this is a way to mark that we have "guessed" the file type
|
||||
|
@ -1806,10 +1870,20 @@ struct llama_model_loader {
|
|||
}
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
const char * name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
|
||||
std::string value = gguf_kv_to_str(ctx_gguf, i);
|
||||
const size_t MAX_VALUE_LEN = 40;
|
||||
if (value.size() > MAX_VALUE_LEN) {
|
||||
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
}
|
||||
|
||||
// print type counts
|
||||
|
@ -2104,6 +2178,17 @@ static void llm_load_hparams(
|
|||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
// get metadata as string
|
||||
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
||||
enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||
if (type == GGUF_TYPE_ARRAY) {
|
||||
continue;
|
||||
}
|
||||
const char * name = gguf_get_key(ctx, i);
|
||||
const std::string value = gguf_kv_to_str(ctx, i);
|
||||
model.gguf_kv.emplace(name, value);
|
||||
}
|
||||
|
||||
// get general kv
|
||||
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
|
||||
|
||||
|
@ -2418,6 +2503,23 @@ static void llm_load_vocab(
|
|||
__func__, key.c_str(), id, old_id);
|
||||
id = old_id;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Handle add_bos_token and add_eos_token
|
||||
std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
|
||||
int kid = gguf_find_key(ctx, key.c_str());
|
||||
enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
|
||||
vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
|
||||
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
|
||||
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
|
||||
}
|
||||
key = kv(LLM_KV_TOKENIZER_ADD_EOS);
|
||||
kid = gguf_find_key(ctx, key.c_str());
|
||||
ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
|
||||
vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
|
||||
if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
|
||||
LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2549,8 +2651,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
||||
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
||||
if (ml.n_bytes < GB) {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
if (ml.n_bytes < GiB) {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
}
|
||||
|
@ -2588,7 +2690,7 @@ static void llm_load_tensors(
|
|||
|
||||
ml.calc_sizes(ctx_size, mmapped_size);
|
||||
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
|
@ -3237,7 +3339,7 @@ static void llm_load_tensors(
|
|||
ctx_size +
|
||||
mmapped_size - vram_weights; // weights in VRAM not in memory
|
||||
|
||||
LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
@ -3256,7 +3358,7 @@ static void llm_load_tensors(
|
|||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
||||
LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
||||
#else
|
||||
(void) n_gpu_layers;
|
||||
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
|
@ -6534,7 +6636,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|||
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
||||
// and passing 'add space prefix' as bool argument
|
||||
//
|
||||
auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
if (&fragment == &fragment_buffer.front()) {
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
||||
|
@ -8198,7 +8303,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
workers.clear();
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
int64_t tot_count = 0;
|
||||
for (size_t i = 0; i < hist_cur.size(); i++) {
|
||||
hist_all[i] += hist_cur[i];
|
||||
|
@ -8741,7 +8846,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
|
||||
{
|
||||
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
||||
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// resized during inference
|
||||
|
@ -8786,7 +8891,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
// measure memory requirements for the graph
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
||||
|
||||
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
||||
|
||||
// recreate allocator with exact memory requirements
|
||||
ggml_allocr_free(ctx->alloc);
|
||||
|
@ -8800,7 +8905,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
#endif
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
ggml_cuda_set_scratch_size(alloc_size);
|
||||
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
||||
|
||||
// calculate total VRAM usage
|
||||
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
||||
|
@ -8820,10 +8925,10 @@ struct llama_context * llama_new_context_with_model(
|
|||
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
||||
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
||||
|
||||
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
|
||||
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
|
||||
total_vram_size / 1024.0 / 1024.0,
|
||||
model_vram_size / 1024.0 / 1024.0,
|
||||
ctx_vram_size / 1024.0 / 1024.0);
|
||||
ctx_vram_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -8844,7 +8949,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
||||
|
||||
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
|
||||
|
||||
#define LLAMA_METAL_CHECK_BUF(result) \
|
||||
if (!(result)) { \
|
||||
|
@ -8910,6 +9015,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
|
|||
return model->hparams.rope_freq_scale_train;
|
||||
}
|
||||
|
||||
int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
|
||||
const auto & it = model->gguf_kv.find(key);
|
||||
if (it == model->gguf_kv.end()) {
|
||||
if (buf_size > 0) {
|
||||
buf[0] = '\0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||
}
|
||||
|
||||
int llama_model_meta_count(const struct llama_model * model) {
|
||||
return (int)model->gguf_kv.size();
|
||||
}
|
||||
|
||||
int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
||||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||||
if (buf_size > 0) {
|
||||
buf[0] = '\0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
auto it = model->gguf_kv.begin();
|
||||
std::advance(it, i);
|
||||
return snprintf(buf, buf_size, "%s", it->first.c_str());
|
||||
}
|
||||
|
||||
int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
||||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||||
if (buf_size > 0) {
|
||||
buf[0] = '\0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
auto it = model->gguf_kv.begin();
|
||||
std::advance(it, i);
|
||||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||
}
|
||||
|
||||
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||
return snprintf(buf, buf_size, "%s %s %s",
|
||||
llama_model_arch_name(model->arch).c_str(),
|
||||
|
@ -9561,6 +9705,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
|
|||
return model->vocab.linefeed_id;
|
||||
}
|
||||
|
||||
int llama_add_bos_token(const struct llama_model * model) {
|
||||
return model->vocab.special_add_bos;
|
||||
}
|
||||
|
||||
int llama_add_eos_token(const struct llama_model * model) {
|
||||
return model->vocab.special_add_eos;
|
||||
}
|
||||
|
||||
llama_token llama_token_prefix(const struct llama_model * model) {
|
||||
return model->vocab.special_prefix_id;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue