Commit 4b00916ac7 in LostRuins/koboldcpp (mirror of https://github.com/LostRuins/koboldcpp.git)

Merge branch 'master' into concedo_experimental

# Conflicts:
#   .dockerignore
#   .github/workflows/build.yml
#   CMakeLists.txt
#   Makefile
#   README.md
#   flake.lock
#   flake.nix
#   tests/CMakeLists.txt

53 changed files with 4980 additions and 1910 deletions

llama.cpp (531 changed lines shown below)
@@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
 #endif
 
 #include "llama.h"
@@ -63,6 +60,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@@ -115,12 +115,17 @@ static size_t utf8_len(char src) {
 }
 
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    for (size_t pos = 0; ; pos += replace.length()) {
-        pos = s.find(search, pos);
-        if (pos == std::string::npos) break;
-        s.erase(pos, search.length());
-        s.insert(pos, replace);
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
     }
+    s = std::move(result);
 }
 
 static void zeros(std::ofstream & file, size_t n) {
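A standalone sketch of the rewritten helper (illustrative only, not part of the diff), showing the single-pass result building; the demo names are hypothetical:

    #include <iostream>
    #include <string>

    // Same approach as the rewritten replace_all above: build the result in one pass.
    static void replace_all_demo(std::string & s, const std::string & search, const std::string & replace) {
        std::string result;
        for (size_t pos = 0; ; pos += search.length()) {
            auto new_pos = s.find(search, pos);
            if (new_pos == std::string::npos) {
                result += s.substr(pos, s.size() - pos);
                break;
            }
            result += s.substr(pos, new_pos - pos) + replace;
            pos = new_pos;
        }
        s = std::move(result);
    }

    int main() {
        std::string s = "a,b,c";
        replace_all_demo(s, ",", ", ");  // the replacement may safely contain the search string
        std::cout << s << "\n";          // prints "a, b, c"
    }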
@@ -196,6 +201,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
 
     LLM_KV_TOKENIZER_MODEL,
@@ -239,6 +245,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
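These "%s" templates are expanded with the architecture name when a GGUF key is looked up. A hedged sketch of that expansion (the helper and the "llama" arch string are illustrative assumptions):

    #include <cstdio>
    #include <string>

    // Build a GGUF metadata key such as "llama.rope.freq_base" from the "%s" templates above.
    static std::string kv_key(const char * fmt, const char * arch) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), fmt, arch);
        return buf;
    }

    int main() {
        std::printf("%s\n", kv_key("%s.rope.freq_base", "llama").c_str()); // llama.rope.freq_base
    }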
@@ -795,12 +802,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -828,6 +835,7 @@ enum e_model {
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
+    MODEL_34B,
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
@@ -953,10 +961,10 @@ struct llama_vocab {
     id linefeed_id = 13;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "Ġ");
-        replace_all(token_left, "\n", "Ċ");
-        replace_all(token_right, " ", "Ġ");
-        replace_all(token_right, "\n", "Ċ");
+        replace_all(token_left, " ", "\u0120");
+        replace_all(token_left, "\n", "\u010A");
+        replace_all(token_right, " ", "\u0120");
+        replace_all(token_right, "\n", "\u010A");
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
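Side note (not part of the diff): "\u0120" and "\u010A" are just source-level spellings of the byte-level BPE markers Ġ and Ċ, so the lookup behavior is unchanged. A minimal check, assuming a UTF-8 execution character set:

    #include <cassert>
    #include <string>

    int main() {
        // U+0120 (Ġ) and U+010A (Ċ) encode to the same UTF-8 bytes either way.
        assert(std::string("\u0120") == "\xC4\xA0");
        assert(std::string("\u010A") == "\xC4\x8A");
    }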
@@ -1142,11 +1150,13 @@ static bool llama_kv_cache_init(
 
 enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
 };
 
 static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
     }
 
     return "unknown";
@@ -1519,6 +1529,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
         case MODEL_65B: return "65B";
         case MODEL_70B: return "70B";
@@ -1560,12 +1571,26 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope scale should override this
+    // TODO: manually setting rope freq base and scale should override this
+    // FIXME: partial fix when the param specified is not the default value, but
+    // will not work for overriding the model value to the params default
+
+    llama_context_params defaults = llama_context_default_params();
+
+    // rope_freq_base
+    {
+        float ropebase = 10000.0f;
+        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
+            rope_freq_base = ropebase;
+        }
+    }
+
     // rope_freq_scale (inverse of the kv) is optional
     {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f) {
+        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
            rope_freq_scale = 1.0f/ropescale;
        }
    }
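Restating the precedence rule above as a hedged sketch (hypothetical helper, not in the diff): a value read from GGUF metadata only wins when the caller left the corresponding context parameter at its default.

    // Hypothetical illustration of the precedence used for rope_freq_base above.
    static float pick_rope_freq_base(float from_user, float from_model, float default_value) {
        // Prefer the model's metadata value only when it is present (non-default)
        // and the user did not override the context parameter.
        if (from_model != 10000.0f && from_user == default_value) {
            return from_model;
        }
        return from_user;
    }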
@@ -1591,6 +1616,7 @@ static void llm_load_hparams(
                 case 26: model.type = e_model::MODEL_3B; break;
                 case 32: model.type = e_model::MODEL_7B; break;
                 case 40: model.type = e_model::MODEL_13B; break;
+                case 48: model.type = e_model::MODEL_34B; break;
                 case 60: model.type = e_model::MODEL_30B; break;
                 case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
@@ -1617,7 +1643,8 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
         llama_model_loader & ml,
@@ -1719,7 +1746,11 @@ static void llm_load_vocab(
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-    vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else {
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    }
 
     // special tokens
     GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
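Context for the SPM branch above (illustrative, not part of the diff): SPM vocabularies carry raw-byte tokens whose text looks like "<0x0A>", which is what llama_byte_to_token() resolves; a minimal sketch of that formatting, assuming the "<0x%02X>" naming convention:

    #include <cstdio>

    int main() {
        char buf[16];
        std::snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned char) '\n');
        std::printf("%s\n", buf); // prints "<0x0A>", the token text looked up in the SPM vocab
    }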
@@ -1818,7 +1849,7 @@ static void llm_load_tensors(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1940,6 +1971,14 @@ static void llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                     model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
                 }
 
                 const uint32_t n_ff = hparams.n_ff;
@@ -1949,7 +1988,7 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
@@ -1960,6 +1999,11 @@ static void llm_load_tensors(
                     if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
                         layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
                         layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(layer.attn_norm_2);
+                            vram_weights += ggml_nbytes(layer.attn_norm_2_b);
+                        }
                     }
 
                     layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@@ -1967,6 +2011,13 @@ static void llm_load_tensors(
 
                     layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                     layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
                 }
             } break;
         default:
@@ -2597,18 +2648,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
                 const size_t wsize = ggml_type_size(cur->type);
 
-                struct ggml_tensor * tmpq = ggml_view_3d(
+                // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+                // non-contiguous views is added for the rope operator
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                     ctx0, cur, n_embd_head, n_head, N,
                     wsize * n_embd_head,
                     wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                    0);
+                    0));
                 offload_func_kq(tmpq);
 
-                struct ggml_tensor * tmpk = ggml_view_3d(
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                     ctx0, cur, n_embd_head, n_head_kv, N,
                     wsize * n_embd_head,
                     wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                    wsize * n_embd_head * n_head);
+                    wsize * n_embd_head * n_head));
                 offload_func_kq(tmpk);
 
                 struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2705,11 +2758,6 @@ static struct ggml_cgraph * llm_build_falcon(
             struct ggml_tensor * inpFF = attn_norm;
 
             cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
-
-            // TODO: this is temporary needed to introduce artificial dependency between FF and ATTN
-            // adding this, because there seems to be a bug in the Metal concurrency optimization
-            // without this line, the results are non-deterministic and wrong
-            cur->src[2] = attn_out;
             offload_func(cur);
 
             cur = ggml_gelu(ctx0, cur);
@@ -2798,7 +2846,6 @@ static bool llama_eval_internal(
 
     GGML_ASSERT(n_tokens > 0);
     GGML_ASSERT(n_past >= 0);
-    GGML_ASSERT(n_threads > 0);
     // TODO: keep the values of n_batch and n_ctx
     // GGML_ASSERT(n_tokens <= n_batch);
     // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2809,6 +2856,8 @@ static bool llama_eval_internal(
     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
+    GGML_ASSERT(n_threads > 0);
+
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -2993,23 +3042,12 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     return vocab.token_to_id.at(buf);
 }
 
-static std::string llama_escape_whitespace(const std::string& text) {
-    std::string result = "\xe2\x96\x81";
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
 }
 
-static std::string llama_unescape_whitespace(const std::string& word) {
-    if (word.length() >= 3 && word.substr(0, 3) == "\xe2\x96\x81") {
-        return std::string(" ") + word.substr(3);
-    }
-    return word;
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
 }
 
 struct llm_symbol {
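A hedged round-trip sketch of the simplified helpers (standalone; subst() is a local stand-in for replace_all()):

    #include <cassert>
    #include <string>

    // Minimal stand-in for replace_all(), so this snippet compiles on its own.
    static void subst(std::string & s, const std::string & from, const std::string & to) {
        for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
            s.replace(pos, from.size(), to);
        }
    }

    int main() {
        std::string text = "Hello world";
        subst(text, " ", "\xe2\x96\x81");          // escape: ' ' -> U+2581, the SPM whitespace marker
        assert(text == "Hello\xe2\x96\x81world");
        subst(text, "\xe2\x96\x81", " ");          // unescape: U+2581 -> ' '
        assert(text == "Hello world");
    }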
@@ -3189,7 +3227,7 @@ struct llm_bigram_bpe {
 };
 
 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
@@ -3341,8 +3379,6 @@ private:
         return words;
     }
 
-    bool flag_g2ws = false;
-
     const llama_vocab & vocab;
 
     std::vector<llm_symbol> symbols;
@@ -3351,9 +3387,18 @@ private:
     llm_bigram_bpe::queue work_queue;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;
 
+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True) returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
     if (raw_text.empty()) {
         return output;
     }
@@ -3361,29 +3406,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;
+
                 llm_tokenizer_spm tokenizer(vocab);
-
-                if (bos) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
-                std::string text;
-                if (escape) {
-                    text = llama_escape_whitespace(raw_text);
-                } else {
-                    text = raw_text;
-                }
-
-                tokenizer.tokenize(text, output);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab, escape);
-
-                if (bos && vocab.special_bos_id != -1) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
+                llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
     };
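A hedged caller-side sketch of tokenization through the public C API (the llama_tokenize signature of this vintage is assumed; a negative return signals the buffer was too small):

    #include <string>
    #include <vector>
    #include "llama.h"

    // Tokenize text, letting the library decide BOS and whitespace handling per vocab type.
    static std::vector<llama_token> tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
        // Rough upper bound: one token per byte, plus room for an optional BOS.
        std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0) + 1);
        int n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {            // buffer too small; -n is the required count
            tokens.resize(-n);
            n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }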
@@ -3878,7 +3910,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
 
     // Calculate absolute value of second derivatives
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
     }
 
     // Normalize the second derivatives
@@ -4069,16 +4101,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     std::vector<llama_grammar_candidate> candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id   = candidates->data[i].id;
-        const std::string text = llama_token_to_text(ctx, id);
+        const llama_token id    = candidates->data[i].id;
+        const std::string piece = llama_token_to_str(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
             }
-        } else if (text.empty() || text[0] == 0) {
+        } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -4282,10 +4314,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string text = llama_token_to_text(ctx, token);
+    const std::string piece = llama_token_to_str(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
+    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4296,6 +4328,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
+//
+// Beam search
+//
+
+struct llama_beam {
+    std::vector<llama_token> tokens;
+    float p;  // Cumulative beam probability (renormalized relative to all beams)
+    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+    // Sort beams by probability. In case of ties, prefer beams at eob.
+    bool operator<(const llama_beam & rhs) const {
+        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+    }
+    // Shift off first n tokens and discard them.
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap; // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetative patterns below reflect the 2 stages of heaps:
+    // * Gather elements until the vector is full, then call std::make_heap() on it.
+    // * If the heap is full and a new element is found that should be included, pop the
+    //   least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+            }
+            llama_logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+            size_t i=0;
+            if (next_beams.size() < n_beams) {
+                for (; next_beams.size() < n_beams ; ++i) {
+                    llama_beam next_beam = beam;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < n_beams ; ++i) {
+                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+        }
+    }
+
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i = 1 ; i < beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    llama_beams_state get_beams_state(const bool last_call) {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beam_views[i] = beams[i].view();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+    }
+
+    // Loop:
+    // * while i < n_predict, AND
+    // * any of the beams have not yet reached end-of-beam (eob), AND
+    // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+    //   (since all other beam probabilities can only decrease)
+    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+        beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+            update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
+
+    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
+
+    // Copy (p,eob) for each beam which may have been changed by the callback.
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
+
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+    beam_search_data.loop(callback, callback_data);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+}
+
 //
 // quantization
 //
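A hedged caller-side sketch of the new entry point (not part of the diff; the llama_beams_state / llama_beam_view field names and llama_token_eos(ctx) are assumptions based on the llama.h that accompanies this change):

    #include <vector>
    #include "llama.h"

    // Collects committed tokens from the top beam and marks beams that hit EOS.
    struct beam_sink {
        llama_context * ctx;
        std::vector<llama_token> response;
    };

    static void beam_callback(void * data, llama_beams_state state) {
        auto & sink = *static_cast<beam_sink *>(data);
        // Flag beams whose last token is EOS so the search stops extending them.
        for (size_t i = 0; i < state.n_beams; ++i) {
            llama_beam_view & bv = state.beam_views[i];
            if (!bv.eob && bv.n_tokens > 0 && bv.tokens[bv.n_tokens - 1] == llama_token_eos(sink.ctx)) {
                bv.eob = true;
            }
        }
        // Tokens shared by every beam are final; collect them from beam 0.
        if (state.common_prefix_length > 0) {
            const llama_beam_view & top = state.beam_views[0];
            sink.response.insert(sink.response.end(), top.tokens, top.tokens + state.common_prefix_length);
        }
    }

    // Usage, once the prompt has been evaluated up to n_past tokens:
    //   beam_sink sink{ctx, {}};
    //   llama_beam_search(ctx, beam_callback, &sink, /*n_beams=*/4, n_past, /*n_predict=*/128, /*n_threads=*/4);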
@@ -4393,6 +4676,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4418,6 +4705,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++n_feed_forward_w2;
         }
     }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
@@ -4494,8 +4785,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
             int nx = tensor->ne[0];
-            int ny = tensor->ne[1];
-            if (nx % QK_K == 0 && ny % QK_K == 0) {
+            if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
                 new_type = GGML_TYPE_Q6_K;
             }
         } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4509,21 +4802,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
             else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+            if (model.type == MODEL_70B) {
+                // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                // nearly negligible increase in model size by quantizing this tensor with more bits:
+                if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            }
             ++i_attention_wv;
         } else if (name.find("ffn_down.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                         : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                         : GGML_TYPE_Q3_K;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                if (model.arch == LLM_ARCH_FALCON) {
+                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                               use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                } else {
+                    if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                }
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+                new_type = GGML_TYPE_Q5_K;
+            }
             ++i_feed_forward_w2;
         } else if (name.find("attn_output.weight") != std::string::npos) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            if (model.arch != LLM_ARCH_FALCON) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            }
         }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        }
         else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4538,8 +4859,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
             int nx = tensor->ne[0];
             int ny = tensor->ne[1];
-            if (nx % QK_K != 0 || ny % QK_K != 0) {
-                LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+            if (nx % QK_K != 0) {
+                LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                 convert_incompatible_tensor = true;
             }
         }
@@ -5267,13 +5588,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
 
+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
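A hedged caller sketch combining the renamed llama_model_desc() with the new size and parameter queries shown above:

    #include <cstdio>
    #include "llama.h"

    // Print a one-line summary for an already-loaded model.
    static void print_model_summary(const llama_model * model) {
        char desc[128];
        llama_model_desc(model, desc, sizeof(desc));
        std::printf("%s | %.2f GiB | %.2f B params\n",
                    desc,
                    llama_model_size(model) / 1024.0 / 1024.0 / 1024.0,
                    llama_model_n_params(model) / 1e9);
    }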
@@ -5798,8 +6135,7 @@ int llama_tokenize_with_model(
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5813,17 +6149,17 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }
 
-// does not write null-terminator to str
-int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+// does not write null-terminator to buf
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
-                result = llama_unescape_whitespace(result);
+                llama_unescape_whitespace(result);
             }
             if (length < (int) result.length()) {
                 return -result.length();
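A hedged helper showing the intended call pattern for the renamed public API (retry with a larger buffer when the call returns a negative required length), mirroring the internal helper earlier in this diff:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Convert a single token to its piece of text using llama_token_to_piece().
    static std::string token_to_piece(const llama_context * ctx, llama_token token) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(ctx, token, buf.data(), (int) buf.size());
        if (n < 0) {                 // buffer too small; -n is the required length
            buf.resize(-n);
            n = llama_token_to_piece(ctx, token, buf.data(), (int) buf.size());
        }
        return std::string(buf.data(), n);
    }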
@@ -5906,6 +6242,7 @@ const char * llama_print_system_info(void) {
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 
     return s.c_str();