broken commit

This commit is contained in:
Concedo 2025-01-16 21:41:18 +08:00
commit 2a00ee8fa8
36 changed files with 5868 additions and 5479 deletions

View file

@ -47,8 +47,12 @@ static const char * sample(struct common_sampler * smpl,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
@ -239,11 +243,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);

View file

@ -384,7 +384,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@ -456,7 +456,7 @@ struct llava_embd_batch {
};
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;

View file

@ -54,7 +54,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@ -167,8 +167,12 @@ static const char * sample(struct common_sampler * smpl,
int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);

View file

@ -27,7 +27,7 @@
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
@ -132,8 +132,12 @@ static const char * sample(struct common_sampler * smpl,
int * n_past, int * st_pos_id) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
@ -328,11 +332,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@ -481,7 +484,7 @@ static void debug_test_mrope_2d() {
}
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4;
float vals[56 * 56 * 3];
// float embd[ne];

View file

@ -6,7 +6,6 @@
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <ctime>
@ -164,6 +163,8 @@ int main(int argc, char ** argv) {
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
@ -197,7 +198,7 @@ int main(int argc, char ** argv) {
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
if (n_ctx > n_ctx_train) {
@ -242,9 +243,9 @@ int main(int argc, char ** argv) {
}
}
const bool add_bos = llama_add_bos_token(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model));
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
}
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
@ -270,7 +271,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
embd_inp.push_back(llama_vocab_bos(vocab));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_ERR("input is empty\n");
@ -496,7 +497,7 @@ int main(int argc, char ** argv) {
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_token_bos(model);
decoder_start_token_id = llama_vocab_bos(vocab);
}
embd_inp.clear();
@ -743,7 +744,7 @@ int main(int argc, char ** argv) {
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, common_sampler_last(smpl))) {
if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n");
if (params.interactive) {
@ -777,7 +778,7 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) {
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
embd_inp.push_back(llama_vocab_bos(vocab));
}
std::string buffer;
@ -831,8 +832,8 @@ int main(int argc, char ** argv) {
// if the user stops generation mid-way, we must add EOT to finish the model's last response
if (need_insert_eot && format_chat) {
llama_token eot = llama_token_eot(model);
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
llama_token eot = llama_vocab_eot(vocab);
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
need_insert_eot = false;
}
@ -867,7 +868,7 @@ int main(int argc, char ** argv) {
}
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
LOG(" [end of text]\n");
break;
}

View file

@ -98,7 +98,7 @@ struct slot_params {
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_lora_adapter_info> lora;
std::vector<common_adapter_lora_info> lora;
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
@ -198,15 +198,17 @@ struct server_task {
bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_lora_adapter_info> set_lora;
std::vector<common_adapter_lora_info> set_lora;
server_task(server_task_type type) : type(type) {}
static slot_params params_from_json_cmpl(
const llama_model * model,
const llama_context * ctx,
const common_params & params_base,
const json & data) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
slot_params params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@ -329,7 +331,7 @@ struct server_task {
const auto & logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array()) {
const int n_vocab = llama_n_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & el : *logit_bias) {
// TODO: we may want to throw errors here, in case "el" is incorrect
if (el.is_array() && el.size() == 2) {
@ -348,7 +350,7 @@ struct server_task {
params.sampling.logit_bias.push_back({tok, bias});
}
} else if (el[0].is_string()) {
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks) {
params.sampling.logit_bias.push_back({tok, bias});
}
@ -1131,7 +1133,7 @@ struct server_slot {
common_speculative * spec = nullptr;
std::vector<common_lora_adapter_info> lora;
std::vector<common_adapter_lora_info> lora;
// the index relative to completion multi-task request
size_t index = 0;
@ -1633,6 +1635,8 @@ struct server_context {
llama_model * model = nullptr;
llama_context * ctx = nullptr;
const llama_vocab * vocab = nullptr;
llama_model * model_dft = nullptr;
llama_context_params cparams_dft;
@ -1690,10 +1694,12 @@ struct server_context {
return false;
}
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_add_bos_token(model);
has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
@ -1736,7 +1742,8 @@ struct server_context {
// Probes whether a chat template can be applied for `model`: formats a single
// dummy {"user", "test"} message with a null output buffer of size 0 and
// reports success when llama_chat_apply_template returns a positive value.
// Note: this is a commit view, so the removed call (passing `model, nullptr`)
// and its replacement (passing the template string fetched via
// llama_model_chat_template) are both listed below.
bool validate_builtin_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}};
int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
const char * tmpl = llama_model_chat_template(model);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
@ -1756,7 +1763,7 @@ struct server_context {
if (model_dft) {
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
if (slot.ctx_dft == nullptr) {
SRV_ERR("%s", "failed to create draft context\n");
return;
@ -1891,7 +1898,7 @@ struct server_context {
}
if (slot.params.ignore_eos && has_eos_token) {
slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
}
{
@ -2047,14 +2054,14 @@ struct server_context {
slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
}
if (llama_token_is_eog(model, result.tok)) {
if (llama_vocab_is_eog(vocab, result.tok)) {
slot.stop = STOP_TYPE_EOS;
slot.has_next_token = false;
SLT_DBG(slot, "%s", "stopped by EOS\n");
}
const auto n_ctx_train = llama_n_ctx_train(model);
const auto n_ctx_train = llama_model_n_ctx_train(model);
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
slot.truncated = true;
@ -2074,7 +2081,7 @@ struct server_context {
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
size_t n_probs = slot.params.sampling.n_probs;
size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
size_t n_vocab = llama_vocab_n_tokens(vocab);
if (post_sampling) {
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
const size_t max_probs = cur_p->size;
@ -2225,7 +2232,7 @@ struct server_context {
res->n_tokens = slot.n_prompt_tokens;
res->oaicompat = slot.params.oaicompat;
const int n_embd = llama_n_embd(model);
const int n_embd = llama_model_n_embd(model);
std::vector<float> embd_res(n_embd, 0.0f);
@ -2927,7 +2934,7 @@ struct server_context {
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, slot_batched->is_non_causal());
// apply lora, only need to do it once per batch
common_lora_adapters_apply(ctx, slot_batched->lora);
common_set_adapter_lora(ctx, slot_batched->lora);
}
// process the created batch of tokens
@ -3129,12 +3136,12 @@ struct server_context {
// Builds a JSON object describing the loaded model. The keys are
// "vocab_type", "n_vocab", "n_ctx_train", "n_embd", "n_params" and "size",
// each filled from the llama accessor of the matching name.
// Note: this is a commit view, so the six removed entries (all queried from
// `model`) are followed by their six replacements, where the vocabulary
// entries are queried from `vocab` instead.
json model_meta() const {
return json {
{"vocab_type", llama_vocab_type (model)},
{"n_vocab", llama_n_vocab (model)},
{"n_ctx_train", llama_n_ctx_train (model)},
{"n_embd", llama_n_embd (model)},
{"n_params", llama_model_n_params(model)},
{"size", llama_model_size (model)},
{"vocab_type", llama_vocab_type (vocab)},
{"n_vocab", llama_vocab_n_tokens (vocab)},
{"n_ctx_train", llama_model_n_ctx_train(model)},
{"n_embd", llama_model_n_embd (model)},
{"n_params", llama_model_n_params (model)},
{"size", llama_model_size (model)},
};
}
};
@ -3639,7 +3646,7 @@ int main(int argc, char ** argv) {
std::vector<server_task> tasks;
try {
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true);
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
server_task task = server_task(type);
@ -3649,7 +3656,6 @@ int main(int argc, char ** argv) {
task.prompt_tokens = std::move(tokenized_prompts[i]);
task.params = server_task::params_from_json_cmpl(
ctx_server.model,
ctx_server.ctx,
ctx_server.params_base,
data);
@ -3745,13 +3751,13 @@ int main(int argc, char ** argv) {
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
// check model compatibility
std::string err;
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
err += "prefix token is missing. ";
}
if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
err += "suffix token is missing. ";
}
if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
err += "middle token is missing. ";
}
if (!err.empty()) {
@ -3797,10 +3803,10 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist
std::string prompt = json_value(data, "prompt", std::string());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill(
ctx_server.ctx,
ctx_server.vocab,
data.at("input_prefix"),
data.at("input_suffix"),
data.at("input_extra"),
@ -3857,7 +3863,7 @@ int main(int argc, char ** argv) {
const bool add_special = json_value(body, "add_special", false);
const bool with_pieces = json_value(body, "with_pieces", false);
llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
if (with_pieces) {
for (const auto& token : tokens) {
@ -3933,7 +3939,7 @@ int main(int argc, char ** argv) {
}
}
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input
if (tokens.empty()) {
@ -4033,20 +4039,20 @@ int main(int argc, char ** argv) {
return;
}
llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0];
llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
// create and queue the task
json responses = json::array();
bool error = false;
{
std::vector<server_task> tasks;
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true);
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]);
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
tasks.push_back(task);
}

View file

@ -118,7 +118,7 @@ static json json_get_nested_values(const std::vector<std::string> & paths, const
* - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/
static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
// If `add_special` is true, we only add BOS when json_prompt is a string,
// or the first element of the json_prompt array is a string.
llama_tokens prompt_tokens;
@ -131,10 +131,10 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
llama_tokens p;
if (first) {
p = common_tokenize(ctx, s, add_special, parse_special);
p = common_tokenize(vocab, s, add_special, parse_special);
first = false;
} else {
p = common_tokenize(ctx, s, false, parse_special);
p = common_tokenize(vocab, s, false, parse_special);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@ -148,7 +148,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
}
} else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
}
return prompt_tokens;
@ -166,11 +166,11 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
* - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
*/
static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
std::vector<llama_tokens> result;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed
result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
} else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens
result.push_back(json_prompt.get<llama_tokens>());
@ -179,7 +179,7 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
result.reserve(json_prompt.size());
for (const auto & p : json_prompt) {
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
} else if (json_is_array_of_numbers(p)) {
// array of tokens
result.push_back(p.get<llama_tokens>());
@ -231,21 +231,23 @@ static size_t validate_utf8(const std::string& text) {
//
// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
// Returns a newly built token list; `query` and `doc` are copied in unchanged.
// Capacity is reserved up front for both inputs plus the 4 special tokens
// (BOS, EOS, SEP, EOS) that frame them.
// Note: this is a commit view, so each removed llama_token_*(model) line is
// listed next to its llama_vocab_*(vocab) replacement.
static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
llama_tokens result;
result.reserve(doc.size() + query.size() + 4);
result.push_back(llama_token_bos(model));
result.push_back(llama_vocab_bos(vocab));
result.insert(result.end(), query.begin(), query.end());
result.push_back(llama_token_eos(model));
result.push_back(llama_token_sep(model));
result.push_back(llama_vocab_eos(vocab));
result.push_back(llama_vocab_sep(vocab));
result.insert(result.end(), doc.begin(), doc.end());
result.push_back(llama_token_eos(model));
result.push_back(llama_vocab_eos(vocab));
return result;
}
// format infill task
static llama_tokens format_infill(
const llama_context * ctx,
const llama_vocab * vocab,
const json & input_prefix,
const json & input_suffix,
const json & input_extra,
@ -272,15 +274,14 @@ static llama_tokens format_infill(
llama_tokens extra_tokens;
extra_tokens.reserve(n_ctx);
auto model = llama_get_model(ctx);
auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
// TODO: make project name an input
static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
extra_tokens.push_back(llama_token_fim_rep(model));
extra_tokens.push_back(llama_vocab_fim_rep(vocab));
extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
}
for (const auto & chunk : input_extra) {
@ -288,28 +289,28 @@ static llama_tokens format_infill(
const std::string text = json_value(chunk, "text", std::string());
const std::string filename = json_value(chunk, "filename", std::string("tmp"));
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
} else {
// chunk separator in binary form to avoid confusing the AI
static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
}
const auto chunk_tokens = common_tokenize(ctx, text, false, false);
const auto chunk_tokens = common_tokenize(vocab, text, false, false);
extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
}
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
// TODO: current filename
static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
}
@ -325,15 +326,15 @@ static llama_tokens format_infill(
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
tokens_suffix.resize(n_suffix_take);
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
if (llama_add_bos_token(model)) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
if (llama_vocab_get_add_bos(vocab)) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
}
SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
@ -342,7 +343,7 @@ static llama_tokens format_infill(
embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
embd_inp.push_back(llama_token_fim_mid(model));
embd_inp.push_back(llama_vocab_fim_mid(vocab));
return embd_inp;
}
@ -764,14 +765,18 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
return data;
}
// Serializes `data` to a std::string via json::dump.
// Assuming `json` is nlohmann::json (TODO confirm): indent -1 gives compact
// single-line output, `false` leaves non-ASCII characters unescaped, and
// error_handler_t::replace substitutes invalid UTF-8 instead of throwing.
// Note: this is a commit view, so the removed by-value signature is listed
// above its const-reference replacement.
static std::string safe_json_to_str(json data) {
static std::string safe_json_to_str(const json & data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace);
}
static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
std::vector<llama_token_data> cur;
const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -799,8 +804,8 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
}
static bool are_lora_equal(
const std::vector<common_lora_adapter_info> & l1,
const std::vector<common_lora_adapter_info> & l2) {
const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_adapter_lora_info> & l2) {
if (l1.size() != l2.size()) {
return false;
}
@ -814,10 +819,10 @@ static bool are_lora_equal(
}
// parse lora config from JSON request, returns a copy of lora_base with updated scale
static std::vector<common_lora_adapter_info> parse_lora_request(
const std::vector<common_lora_adapter_info> & lora_base,
static std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_adapter_lora_info> & lora_base,
const json & data) {
std::vector<common_lora_adapter_info> lora(lora_base);
std::vector<common_adapter_lora_info> lora(lora_base);
int max_idx = lora.size();
// clear existing value

View file

@ -414,38 +414,15 @@ static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
prompt.insert(prompt.end(), tokens.begin(), tokens.end());
}
// Text overload of prompt_add: tokenizes `txt` with common_tokenize (forwarding
// `add_special` and `parse_special`) and appends the resulting tokens to
// `prompt` through the token-list overload of prompt_add.
// Note: this is a commit view, so the removed signature/tokenize pair (taking
// a llama_model) is listed before its replacement (taking a llama_vocab).
static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) {
auto tmp = common_tokenize(model, txt, add_special, parse_special);
static void prompt_add(llama_tokens & prompt, const llama_vocab * vocab, const std::string & txt, bool add_special, bool parse_special) {
auto tmp = common_tokenize(vocab, txt, add_special, parse_special);
prompt_add(prompt, tmp);
}
// prompt_init: empties `prompt` and seeds it with the tokenized
// "<|im_start|>\n" marker (add_special = true, parse_special = true).
// Note: this is a commit view. The removed prompt_init signature and body come
// first, then the prepare_guide_tokens helper that this hunk lists between
// them, then the replacement prompt_init body line at the end.
static void prompt_init(llama_tokens & prompt, const llama_model * model) {
static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
prompt.clear();
prompt_add(prompt, model, "<|im_start|>\n", true, true);
}
// prepare_guide_tokens: splits `str` on the "<|text_sep|>" delimiter, tokenizes
// each piece (add_special = false, parse_special = true) and keeps only the
// first token of each piece.
// NOTE(review): tmp[0] is read without checking that tokenization produced at
// least one token; confirm that no piece (e.g. an empty one between adjacent
// delimiters) can tokenize to an empty list.
static std::vector<llama_token> prepare_guide_tokens(const llama_model * model, const std::string& str)
{
const std::string& delimiter = "<|text_sep|>";
std::vector<llama_token> result;
size_t start = 0;
size_t end = str.find(delimiter);
// one guide token per delimiter-terminated piece
while (end != std::string::npos) {
std::string current_word = str.substr(start, end - start);
auto tmp = common_tokenize(model, current_word, false, true);
result.push_back(tmp[0]);
start = end + delimiter.length();
end = str.find(delimiter, start);
}
// Add the last part
std::string current_word = str.substr(start);
auto tmp = common_tokenize(model, current_word, false, true);
result.push_back(tmp[0]);
return result;
prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
}
int main(int argc, char ** argv) {
@ -485,6 +462,8 @@ int main(int argc, char ** argv) {
model_ttc = llama_init_ttc.model.get();
ctx_ttc = llama_init_ttc.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
// TODO: refactor in a common struct
params.model = params.vocoder.model;
params.model_url = params.vocoder.model_url;
@ -515,7 +494,6 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
std::vector<llama_token> codes;
std::vector<llama_token> guide_tokens;
// process prompt and generate voice codes
{
@ -523,24 +501,20 @@ int main(int argc, char ** argv) {
std::vector<llama_token> prompt_inp;
prompt_init(prompt_inp, model_ttc);
prompt_init(prompt_inp, vocab);
prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
prompt_add(prompt_inp, vocab, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
// convert the input text into the necessary format expected by OuteTTS
{
std::string prompt_clean = process_text(params.prompt);
if(params.vocoder.use_guide_tokens)
{
guide_tokens = prepare_guide_tokens(model_ttc,prompt_clean);
}
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
prompt_add(prompt_inp, vocab, prompt_clean, false, true);
}
prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true);
prompt_add(prompt_inp, vocab, "<|text_end|>\n", false, true);
// disabled to save time on tokenizing each time
// TODO: load voices from the json files
@ -577,7 +551,7 @@ it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><
looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
auto tmp = common_tokenize(model_ttc, voice_data, false, true);
auto tmp = common_tokenize(vocab, voice_data, false, true);
printf("\n\n");
for (int i = 0; i < tmp.size(); ++i) {
printf("%d, ", tmp[i]);
@ -743,8 +717,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
int n_past = batch.n_tokens;
int n_decode = 0;
bool next_token_uses_guide_token = true;
while (n_decode <= n_predict) {
// prepare the next batch
common_batch_clear(batch);
@ -756,18 +728,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
continue;
}
llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
if(!guide_tokens.empty() && next_token_uses_guide_token && !llama_token_is_control(model_ttc, new_token_id) && !llama_token_is_eog(model_ttc, new_token_id))
{
llama_token guide_token = guide_tokens[0];
guide_tokens.erase(guide_tokens.begin());
new_token_id = guide_token; //ensure correct word fragment is used
}
//this is the token id that always precedes a new word
next_token_uses_guide_token = (new_token_id == 198);
const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
common_sampler_accept(smpl[i], new_token_id, true);
@ -776,9 +737,9 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
const auto * cands = common_sampler_get_candidates(smpl[i]);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) {
if (llama_vocab_is_eog(vocab, new_token_id) || n_decode == n_predict) {
std::string reason;
if (llama_token_is_eog(model_ttc, new_token_id)) {
if (llama_vocab_is_eog(vocab, new_token_id)) {
reason = "eos";
} else {
reason = "n_predict";
@ -914,7 +875,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
#if 1
// spectral operations
const int n_embd = llama_n_embd(model_cts);
const int n_embd = llama_model_n_embd(model_cts);
const float * embd = llama_get_embeddings(ctx_cts);
auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
@ -966,4 +927,4 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
llama_backend_free();
return 0;
}
}