Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	README.md
#	build-xcframework.sh
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-metal/ggml-metal.m
#	ggml/src/ggml-metal/ggml-metal.metal
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/backend.hpp
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/vecdotq.hpp
#	scripts/compare-llama-bench.py
#	src/CMakeLists.txt
#	src/llama-model.cpp
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-opt.cpp
#	tools/llama-bench/README.md
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/README.md
#	tools/mtmd/clip.cpp
#	tools/rpc/rpc-server.cpp
#	tools/server/CMakeLists.txt
#	tools/server/README.md
commit 21e31e255b
Concedo, 2025-05-13 00:28:35 +08:00
90 changed files with 4390 additions and 1388 deletions


@ -7,6 +7,7 @@
#include "log.h"
#include "sampling.h"
#include "speculative.h"
#include "mtmd.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@ -197,8 +198,8 @@ struct server_task {
int id_target = -1;
// used by SERVER_TASK_TYPE_INFERENCE
slot_params params;
llama_tokens prompt_tokens;
slot_params params;
server_tokens prompt_tokens;
int id_selected_slot = -1;
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
@ -1248,6 +1249,9 @@ struct server_slot {
llama_context * ctx = nullptr;
llama_context * ctx_dft = nullptr;
// multimodal
mtmd_context * mctx = nullptr;
common_speculative * spec = nullptr;
std::vector<common_adapter_lora_info> lora;
@ -1275,14 +1279,14 @@ struct server_slot {
int32_t n_prompt_tokens_processed = 0;
// input prompt tokens
llama_tokens prompt_tokens;
server_tokens prompt_tokens;
size_t last_nl_pos = 0;
std::string generated_text;
llama_tokens generated_tokens;
llama_tokens cache_tokens;
server_tokens cache_tokens;
std::vector<completion_token_output> generated_token_probs;
@ -1476,7 +1480,7 @@ struct server_slot {
{"is_processing", is_processing()},
{"non_causal", is_non_causal()},
{"params", params.to_json()},
{"prompt", common_detokenize(ctx, prompt_tokens)},
{"prompt", prompt_tokens.detokenize(ctx, true)},
{"next_token",
{
{"has_next_token", has_next_token},
@ -1849,13 +1853,16 @@ struct server_context {
llama_model * model = nullptr;
llama_context * ctx = nullptr;
// multimodal
mtmd_context * mctx = nullptr;
const llama_vocab * vocab = nullptr;
llama_model * model_dft = nullptr;
llama_context_params cparams_dft;
llama_batch batch = {};
llama_batch batch {};
bool clean_kv_cache = true;
bool add_bos_token = true;
@ -1878,6 +1885,8 @@ struct server_context {
common_chat_templates_ptr chat_templates;
~server_context() {
mtmd_free(mctx);
// Clear any sampling context
for (server_slot & slot : slots) {
common_sampler_free(slot.smpl);
@ -1965,6 +1974,36 @@ struct server_context {
chat_templates = common_chat_templates_init(model, "chatml");
}
std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()) {
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params_base.mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params_base.cpuparams.n_threads;
mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
if (mctx == nullptr) {
SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
return false;
}
SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str());
if (params_base.ctx_shift) {
params_base.ctx_shift = false;
SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
}
if (params_base.n_cache_reuse) {
params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
}
if (!params_base.speculative.model.path.empty()) {
SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
return false;
}
}
return true;
}
@ -1980,6 +2019,8 @@ struct server_context {
slot.ctx = ctx;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params_base.n_predict;
slot.mctx = mctx;
slot.cache_tokens.has_mtmd = mctx != nullptr;
if (model_dft) {
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
@ -2016,8 +2057,6 @@ struct server_context {
// note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
{
const int32_t n_batch = llama_n_batch(ctx);
// only a single seq_id per token is needed
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
}
@ -2054,7 +2093,7 @@ struct server_context {
}
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens);
// fraction of the common subsequence length compared to the current slot's prompt length
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@ -2096,18 +2135,6 @@ struct server_context {
return ret;
}
bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & token : tokens) {
if (token < 0 || token >= n_vocab) {
return false;
}
}
return true;
}
bool launch_slot_with_task(server_slot & slot, server_task && task) {
slot.reset();
slot.id_task = task.id;
@ -2122,8 +2149,7 @@ struct server_context {
slot.lora = slot.params.lora;
}
bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
if (!can_detokenize) {
if (!slot.prompt_tokens.validate(ctx)) {
send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
return false;
}
@ -2385,6 +2411,15 @@ struct server_context {
queue_results.send(std::move(res));
}
// if multimodal is enabled, send an error and return false
bool ensure_no_mtmd(const int id_task) {
if (mctx) {
send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
return false;
}
return true;
}
void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
auto res = std::make_unique<server_task_result_cmpl_partial>();
@ -2424,7 +2459,7 @@ struct server_context {
res->content = std::move(slot.generated_text);
res->tokens = std::move(slot.generated_tokens);
res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->prompt = slot.prompt_tokens.detokenize(ctx, true);
res->response_fields = std::move(slot.params.response_fields);
res->truncated = slot.truncated;
@ -2734,6 +2769,10 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SLOT_SAVE:
{
if (!ensure_no_mtmd(task.id)) {
break;
}
int id_slot = task.slot_action.slot_id;
server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
@ -2753,7 +2792,8 @@ struct server_context {
std::string filename = task.slot_action.filename;
std::string filepath = task.slot_action.filepath;
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
const llama_tokens & tokens = slot->cache_tokens.get_text_tokens();
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
const int64_t t_end = ggml_time_us();
const double t_save_ms = (t_end - t_start) / 1000.0;
@ -2770,6 +2810,7 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SLOT_RESTORE:
{
if (!ensure_no_mtmd(task.id)) break;
int id_slot = task.slot_action.slot_id;
server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
@ -2788,15 +2829,18 @@ struct server_context {
std::string filename = task.slot_action.filename;
std::string filepath = task.slot_action.filepath;
slot->cache_tokens.resize(slot->n_ctx);
llama_tokens tokens;
tokens.resize(slot->n_ctx);
size_t token_count = 0;
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count);
if (nread == 0) {
slot->cache_tokens.resize(0);
slot->cache_tokens.clear(); // the KV cache may have already been invalidated?
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
break;
}
slot->cache_tokens.resize(token_count);
tokens.resize(token_count);
slot->cache_tokens.clear();
slot->cache_tokens.insert(tokens);
const int64_t t_end = ggml_time_us();
const double t_restore_ms = (t_end - t_start) / 1000.0;
@ -2813,6 +2857,7 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SLOT_ERASE:
{
if (!ensure_no_mtmd(task.id)) break;
int id_slot = task.slot_action.slot_id;
server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
@ -2844,6 +2889,7 @@ struct server_context {
res->id = task.id;
queue_results.send(std::move(res));
} break;
}
}
@ -2889,6 +2935,12 @@ struct server_context {
continue;
}
if (mctx) {
// we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded
// we don't support ctx_shift because an image chunk may contain multiple tokens
GGML_ABORT("not supported by multimodal");
}
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = slot.n_past - n_keep;
@ -2900,11 +2952,14 @@ struct server_context {
llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
new_tokens[i - n_discard] = new_tokens[i];
}
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
new_tokens.resize(slot.cache_tokens.size() - n_discard);
slot.cache_tokens.clear();
slot.cache_tokens.insert(new_tokens);
}
slot.n_past -= n_discard;
@ -2982,7 +3037,7 @@ struct server_context {
SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
// print prompt tokens (for debugging)
if (1) {
/*if (1) {
// first 16 tokens (avoid flooding logs)
for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@ -2992,7 +3047,7 @@ struct server_context {
for (int i = 0; i < (int) prompt_tokens.size(); i++) {
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
}
}
}*/
// empty prompt passed -> release the slot and send empty response
if (prompt_tokens.empty()) {
@ -3034,21 +3089,27 @@ struct server_context {
// if input prompt is too big, truncate it
if (slot.n_prompt_tokens >= slot.n_ctx) {
if (mctx) {
// we should never reach this
GGML_ABORT("not supported by multimodal");
}
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens();
llama_tokens new_tokens(
prompt_tokens.begin(),
prompt_tokens.begin() + slot.params.n_keep);
curr_tokens.begin(),
curr_tokens.begin() + slot.params.n_keep);
new_tokens.insert(
new_tokens.end(),
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
prompt_tokens.end());
curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
curr_tokens.end());
prompt_tokens = std::move(new_tokens);
prompt_tokens.clear();
prompt_tokens.insert(new_tokens);
slot.truncated = true;
slot.n_prompt_tokens = prompt_tokens.size();
@ -3060,13 +3121,18 @@ struct server_context {
if (slot.params.cache_prompt) {
// reuse any previously computed tokens that are common with the new prompt
slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);
// reuse chunks from the cached prompt by shifting their KV cache in the new position
if (params_base.n_cache_reuse > 0) {
size_t head_c = slot.n_past; // cache
size_t head_p = slot.n_past; // current prompt
if (mctx) {
// we should never reach this
GGML_ABORT("not supported by multimodal");
}
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
while (head_c < slot.cache_tokens.size() &&
@ -3092,7 +3158,7 @@ struct server_context {
llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
for (size_t i = 0; i < n_match; i++) {
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]);
slot.n_past++;
}
@ -3140,21 +3206,52 @@ struct server_context {
// remove the non-common part from the cache
slot.cache_tokens.resize(slot.n_past);
// check if we should process the image
if (slot.n_past < slot.n_prompt_tokens
&& slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
// process the image
int32_t new_n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
int32_t n_pos = new_n_past - slot.n_past;
if (res != 0) {
SLT_ERR(slot, "failed to process image, res = %d\n", res);
slot.release();
send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
continue;
}
if (slot.params.cache_prompt) {
const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
slot.cache_tokens.push_back(chunk.get()); // copy
}
slot.n_past += n_pos;
slot.n_prompt_tokens_processed += n_pos;
}
// add prompt tokens for processing in the current batch
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
// get next token to process
llama_token cur_tok = slot.prompt_tokens[slot.n_past];
if (cur_tok == LLAMA_TOKEN_NULL) {
break; // end of text chunk
}
// without pooling, we want to output the embeddings for all the tokens in the batch
const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
slot.cache_tokens.push_back(cur_tok);
}
slot.n_prompt_tokens_processed++;
slot.n_past++;
}
// SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
// entire prompt has been processed
@ -3162,12 +3259,16 @@ struct server_context {
slot.state = SLOT_STATE_DONE_PROMPT;
GGML_ASSERT(batch.n_tokens > 0);
GGML_ASSERT((size_t) slot.n_prompt_tokens == slot.prompt_tokens.size());
common_sampler_reset(slot.smpl);
// Process all prompt tokens through sampler system
for (int i = 0; i < slot.n_prompt_tokens; ++i) {
common_sampler_accept(slot.smpl, prompt_tokens[i], false);
llama_token id = slot.prompt_tokens[i];
if (id != LLAMA_TOKEN_NULL) {
common_sampler_accept(slot.smpl, id, false);
}
}
// extract the logits only for the last token
@ -3320,6 +3421,11 @@ struct server_context {
continue;
}
if (mctx) {
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
GGML_ABORT("not supported by multimodal");
}
// determine the max draft that fits the current slot state
int n_draft_max = slot.params.speculative.n_max;
@ -3346,7 +3452,8 @@ struct server_context {
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
params_spec.p_min = slot.params.speculative.p_min;
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
// keep track of total number of tokens generated in the draft
slot.n_draft_total += draft.size();
@ -3380,7 +3487,7 @@ struct server_context {
slot.n_draft_accepted += ids.size() - 1;
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
slot.cache_tokens.insert({ids.begin(), ids.end() - 1});
llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
@ -3903,6 +4010,7 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model.path },
{ "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
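The new "modalities" entry lets a client discover whether the server was started with a projector before it tries to send image input. A minimal sketch (not part of this patch), assuming a server already listening on localhost:8080:

import requests

props = requests.get("http://localhost:8080/props").json()
if props.get("modalities", {}).get("vision"):
    print("vision input is available (a --mmproj projector is loaded)")
else:
    print("text-only server")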
@ -3950,9 +4058,10 @@ int main(int argc, char ** argv) {
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type,
json & data,
const std::vector<raw_buffer> & files,
const std::function<bool()> & is_connection_closed,
httplib::Response & res,
oaicompat_type oaicompat) {
oaicompat_type oaicompat) -> void {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
if (ctx_server.params_base.embedding) {
@ -3969,15 +4078,69 @@ int main(int argc, char ** argv) {
// TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
// process files
mtmd::bitmaps bitmaps;
const bool has_mtmd = ctx_server.mctx != nullptr;
{
if (!has_mtmd && !files.empty()) {
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
}
// process prompt
std::vector<server_tokens> inputs;
if (oaicompat && !prompt.is_string()) {
throw std::runtime_error("prompt must be a string");
}
if (oaicompat && has_mtmd) {
// multimodal
std::string prompt_str = prompt.get<std::string>();
mtmd_input_text inp_txt = {
prompt_str.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0) {
throw std::runtime_error("Failed to tokenize prompt");
}
server_tokens tmp(chunks, true);
inputs.push_back(std::move(tmp));
} else {
// non-multimodal version
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
for (auto & p : tokenized_prompts) {
auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
inputs.push_back(std::move(tmp));
}
}
tasks.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
server_task task = server_task(type);
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
task.prompt_tokens = std::move(tokenized_prompts[i]);
task.prompt_tokens = std::move(inputs[i]);
task.params = server_task::params_from_json_cmpl(
ctx_server.ctx,
ctx_server.params_base,
@ -4059,9 +4222,11 @@ int main(int argc, char ** argv) {
const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
json data = json::parse(req.body);
return handle_completions_impl(
std::vector<raw_buffer> files; // dummy
handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
files,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE);
@ -4069,9 +4234,11 @@ int main(int argc, char ** argv) {
const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
json data = oaicompat_completion_params_parse(json::parse(req.body));
return handle_completions_impl(
std::vector<raw_buffer> files; // dummy
handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
files,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_COMPLETION);
@ -4146,9 +4313,11 @@ int main(int argc, char ** argv) {
tokenized_prompts[0]
);
return handle_completions_impl(
std::vector<raw_buffer> files; // dummy
handle_completions_impl(
SERVER_TASK_TYPE_INFILL,
data,
files,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
@ -4162,11 +4331,19 @@ int main(int argc, char ** argv) {
}
auto body = json::parse(req.body);
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
std::vector<raw_buffer> files;
json data = oaicompat_completion_params_parse(
body,
params.use_jinja,
params.reasoning_format,
ctx_server.chat_templates.get(),
ctx_server.mctx,
files);
return handle_completions_impl(
handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
files,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_CHAT);
@ -4175,7 +4352,14 @@ int main(int argc, char ** argv) {
// same with handle_chat_completions, but without inference part
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
auto body = json::parse(req.body);
json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
std::vector<raw_buffer> files; // dummy, unused
json data = oaicompat_completion_params_parse(
body,
params.use_jinja,
params.reasoning_format,
ctx_server.chat_templates.get(),
ctx_server.mctx,
files);
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
};
@ -4280,7 +4464,7 @@ int main(int argc, char ** argv) {
}
}
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input
if (tokens.empty()) {
@ -4300,7 +4484,7 @@ int main(int argc, char ** argv) {
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
task.prompt_tokens = std::move(tokenized_prompts[i]);
task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
// OAI-compat
task.params.oaicompat = oaicompat;
@ -4394,13 +4578,14 @@ int main(int argc, char ** argv) {
std::unordered_set<int> task_ids;
{
std::vector<server_task> tasks;
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id();
task.index = i;
task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
tasks.push_back(std::move(task));
}


@ -0,0 +1,59 @@
import pytest
from utils import *
import base64
import requests
server: ServerProcess
IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"
response = requests.get(IMG_URL_0)
response.raise_for_status() # Raise an exception for bad status codes
IMG_BASE64_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinygemma3()
@pytest.mark.parametrize(
"prompt, image_url, success, re_content",
[
# test model is trained on CIFAR-10, but it's quite dumb due to small size
("What is this:\n", IMG_URL_0, True, "(cat)+"),
("What is this:\n", "IMG_BASE64_0", True, "(cat)+"), # passed as a placeholder so the huge base64 string doesn't clog up the log
("What is this:\n", IMG_URL_1, True, "(frog)+"),
("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache
("What is this:\n", "malformed", False, None),
("What is this:\n", "https://google.com/404", False, None), # non-existent image
("What is this:\n", "https://ggml.ai", False, None), # non-image data
]
)
def test_vision_chat_completion(prompt, image_url, success, re_content):
global server
server.start(timeout_seconds=60) # vision model may take longer to load due to download size
if image_url == "IMG_BASE64_0":
image_url = IMG_BASE64_0
res = server.make_request("POST", "/chat/completions", data={
"temperature": 0.0,
"top_k": 1,
"messages": [
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {
"url": image_url,
}},
]},
],
})
if success:
assert res.status_code == 200
choice = res.body["choices"][0]
assert "assistant" == choice["message"]["role"]
assert match_regex(re_content, choice["message"]["content"])
else:
assert res.status_code != 200


@ -88,6 +88,7 @@ class ServerProcess:
chat_template: str | None = None
chat_template_file: str | None = None
server_path: str | None = None
mmproj_url: str | None = None
# session variables
process: subprocess.Popen | None = None
@ -194,6 +195,8 @@ class ServerProcess:
server_args.extend(["--chat-template", self.chat_template])
if self.chat_template_file:
server_args.extend(["--chat-template-file", self.chat_template_file])
if self.mmproj_url:
server_args.extend(["--mmproj-url", self.mmproj_url])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"tests: starting server with: {' '.join(args)}")
@ -379,6 +382,21 @@ class ServerPreset:
server.server_reranking = True
return server
@staticmethod
def tinygemma3() -> ServerProcess:
server = ServerProcess()
# mmproj is already provided by HF registry API
server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
server.model_hf_file = "tinygemma3-Q8_0.gguf"
server.mmproj_url = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf"
server.model_alias = "tinygemma3"
server.n_ctx = 1024
server.n_batch = 32
server.n_slots = 2
server.n_predict = 4
server.seed = 42
return server
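A minimal usage sketch (not part of this patch), mirroring how the vision test earlier in this commit consumes the new preset:

# model and mmproj projector are downloaded from the HF repo on first start
server = ServerPreset.tinygemma3()
server.start(timeout_seconds=60)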
def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
"""


@ -3,7 +3,9 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "arg.h" // common_remote_get_content
#include "base64.hpp"
#include "mtmd.h"
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@ -21,6 +23,7 @@
#include <string>
#include <vector>
#include <memory>
#include <cinttypes>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
@ -41,6 +44,8 @@ using json = nlohmann::ordered_json;
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
using raw_buffer = std::vector<uint8_t>;
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
@ -386,7 +391,7 @@ static inline bool is_base64(uint8_t c) {
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
static inline raw_buffer base64_decode(const std::string & encoded_string) {
int i = 0;
int j = 0;
int in_ = 0;
@ -396,7 +401,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
raw_buffer ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
char_array_4[i++] = encoded_string[in_]; in_++;
@ -579,7 +584,9 @@ static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
bool use_jinja,
common_reasoning_format reasoning_format,
const struct common_chat_templates * tmpls)
const struct common_chat_templates * tmpls,
bool allow_non_text,
std::vector<raw_buffer> & out_files)
{
json llama_params;
@ -627,8 +634,77 @@ static json oaicompat_completion_params_parse(
}
}
// get input files
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
json messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
}
for (auto & msg : messages) {
json & content = msg.at("content");
if (content.is_string() || content.is_null()) {
continue;
}
if (!content.is_array()) {
throw std::runtime_error("Expected 'content' to be a string or an array");
}
for (auto & p : content) {
std::string type = json_value(p, "type", std::string());
json image_url = json_value(p, "image_url", json::object());
if (type == "image_url") {
if (!allow_non_text) {
throw std::runtime_error("image input is not supported by this server");
}
std::string url = json_value(image_url, "url", std::string());
if (string_starts_with(url, "http")) {
// download remote image
// TODO @ngxson : maybe make these params configurable
common_remote_params params;
params.headers.push_back("User-Agent: llama.cpp/" + build_info);
params.max_size = 1024 * 1024 * 10; // 10MB
params.timeout = 10; // seconds
SRV_INF("downloading image from '%s'\n", url.c_str());
auto res = common_remote_get_content(url, params);
if (200 <= res.first && res.first < 300) {
SRV_INF("downloaded %ld bytes\n", res.second.size());
raw_buffer data;
data.insert(data.end(), res.second.begin(), res.second.end());
out_files.push_back(data);
} else {
throw std::runtime_error("Failed to download image");
}
} else {
// try to decode base64 image
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
if (parts.size() != 2) {
throw std::runtime_error("Invalid image_url.url value");
} else if (!string_starts_with(parts[0], "data:image/")) {
throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
} else if (!string_ends_with(parts[0], "base64")) {
throw std::runtime_error("image_url.url must be base64 encoded");
} else {
auto base64_data = parts[1];
auto decoded_data = base64_decode(base64_data);
out_files.push_back(decoded_data);
}
}
// replace this chunk with a marker
p["type"] = "text";
p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
p.erase("image_url");
}
}
}
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
inputs.tools = common_chat_tools_parse_oaicompat(tools);
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
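For reference, a minimal client-side sketch (not part of this patch) of a request this parser accepts. The data URI must have the form "data:image/<type>;base64,<payload>" so that the single split on ',' yields exactly two parts; the file name and server address below are assumptions.

import base64
import requests

# hypothetical local image; the server decodes the base64 payload via base64_decode()
with open("cat.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

res = requests.post("http://localhost:8080/v1/chat/completions", json={
    "messages": [
        {"role": "user", "content": [
            {"type": "text", "text": "What is this?"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64," + b64}},
        ]},
    ],
})
print(res.json()["choices"][0]["message"]["content"])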
@ -935,3 +1011,286 @@ static std::vector<common_adapter_lora_info> parse_lora_request(
return lora;
}
//
// utils for interacting with libmtmd
// (may need to refactor in the near future)
//
/**
* server_tokens is a helper to manage the input tokens and images for the server.
* It is made this way to simplify the logic of KV cache management.
*/
struct server_tokens {
bool has_mtmd = false;
private: // disallow accessing these members directly, risking out-of-sync
// map a **start** position in tokens to the image chunk
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
// a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
// important: for models using mrope, an image can contain multiple tokens but will use only one **position**
llama_tokens tokens;
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// pos 0 1 2 3 4 5 6 7 8 9
// map_pos_to_image will contain: {5, img0}, {8, img1}
public:
server_tokens() = default;
~server_tokens() = default;
// Prevent copying
server_tokens(const server_tokens&) = delete;
server_tokens& operator=(const server_tokens&) = delete;
// Allow moving (usually implicitly generated if members are movable)
server_tokens(server_tokens&&) = default;
server_tokens& operator=(server_tokens&&) = default;
// Allow accessing elements using [] operator
llama_token operator[](size_t index) { return tokens[index]; }
const llama_token& operator[](size_t index) const { return tokens[index]; }
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
push_back(mtmd_chunks[i]);
}
}
server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
// for debugging
std::string str() const {
std::ostringstream oss;
oss << "tokens: ";
for (const auto & t : tokens) {
if (t == LLAMA_TOKEN_NULL) {
oss << "<embd> ";
} else {
oss << t << " ";
}
}
oss << "\n";
oss << "image pos: ";
for (const auto & it : map_pos_to_image) {
oss << it.first << ", ";
}
return oss.str();
}
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
auto it = map_pos_to_image.find(pos);
if (it != map_pos_to_image.end()) {
return it->second;
} else {
throw std::runtime_error("Chunk not found");
}
}
void push_back(llama_token tok) {
if (tok == LLAMA_TOKEN_NULL) {
throw std::runtime_error("Invalid token");
}
tokens.emplace_back(tok);
}
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk * chunk) {
auto type = mtmd_input_chunk_get_type(chunk);
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
GGML_ASSERT(has_mtmd);
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
llama_pos start_pos = tokens.size();
for (int i = 0; i < n_pos; ++i) {
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_image[start_pos] = std::move(new_chunk);
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
for (size_t i = 0; i < n_tokens; ++i) {
push_back(text_tokens[i]);
}
} else {
GGML_ABORT("Invalid chunk type");
}
}
// for compatibility with context shift and prompt truncation
void insert(const llama_tokens & inp_tokens) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
}
// for compatibility with speculative decoding, ctx shift, slot save/load
const llama_tokens & get_text_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
return tokens;
}
// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
tokens[pos] = id;
}
size_t size() const {
return tokens.size();
}
bool empty() const {
return tokens.empty();
}
void clear() {
tokens.clear();
}
void resize(size_t n) {
GGML_ASSERT(n <= tokens.size());
if (has_mtmd) {
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// n 1 2 3 4 5 6 7 8 9 10
// allowed to resize ^ ^
// disallowed to resize ^ ^ ^
if (n > 0) {
llama_token last_token = tokens[n - 1];
// make sure we never remove tokens in the middle of an image
if (last_token == LLAMA_TOKEN_NULL) {
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
}
}
// remove all image chunks that are not used anymore
for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
llama_pos pos = it->first;
if (pos >= (llama_pos)n) {
it = map_pos_to_image.erase(it);
} else {
++it;
}
}
}
tokens.resize(n);
}
std::string detokenize(const llama_context * ctx, bool special) const {
llama_tokens text_tokens;
text_tokens.reserve(tokens.size());
for (const auto & t : tokens) {
if (t != LLAMA_TOKEN_NULL) {
text_tokens.push_back(t);
}
}
return common_detokenize(ctx, text_tokens, special);
}
size_t get_common_prefix(const server_tokens & b) const {
size_t max_idx = std::min(tokens.size(), b.tokens.size());
for (size_t i = 0; i < max_idx; ++i) {
auto & ai = tokens[i];
auto & bi = b.tokens[i];
if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
GGML_ASSERT(has_mtmd);
const auto & a_chunk = find_chunk(i);
const auto & b_chunk = b.find_chunk(i);
GGML_ASSERT(a_chunk && b_chunk);
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
std::string ai_id = mtmd_image_tokens_get_id(a_img);
std::string bi_id = mtmd_image_tokens_get_id(b_img);
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
if (ai_id == bi_id && a_pos == b_pos) {
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
i += a_pos - 1; // will be +1 by the for loop
continue;
} else {
return i;
}
} else if (ai == bi) {
continue;
} else {
return i;
}
}
return max_idx; // all tokens are equal
}
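// worked example (illustrative only): comparing
//   this: [15] [99] [img0] [img0] [img0] [42]
//   b:    [15] [99] [img0] [img0] [img0] [77]
// positions 0 and 1 match as plain text; position 2 holds LLAMA_TOKEN_NULL in both,
// the image ids and n_pos (3) are equal, so i jumps past the image to position 5,
// where 42 != 77 and the function returns 5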
// make sure all text tokens are within the vocab range
bool validate(const struct llama_context * ctx) const {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
for (size_t i = 0; i < tokens.size(); ++i) {
auto & t = tokens[i];
if (t == LLAMA_TOKEN_NULL) {
try {
const auto & chunk = find_chunk(i);
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
i += n_pos - 1; // will be +1 by the for loop
} catch (const std::exception & e) {
return false;
}
} else if (t < 0 || t >= n_vocab) {
return false;
}
}
return true;
}
// encode and decode the image chunk
int32_t process_chunk(
llama_context * ctx,
mtmd_context * mctx,
llama_pos n_past,
int32_t seq_id,
llama_pos & n_pos_out) {
auto it = map_pos_to_image.find(n_past);
if (it == map_pos_to_image.end()) {
throw std::runtime_error("Chunk not found");
}
SRV_INF("%s\n", "processing image...");
int32_t n_batch = llama_n_batch(ctx);
int64_t t0 = ggml_time_ms();
llama_pos new_n_past = n_past;
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
it->second.get(), // chunk
n_past,
seq_id,
n_batch,
true, // logits last
&new_n_past);
SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
if (result != 0) {
LOG_ERR("mtmd_helper_eval failed with status %d\n", result);
n_pos_out = n_past;
return result;
}
n_pos_out = new_n_past;
return 0;
}
};
// Computes FNV-1a hash of the data
static std::string fnv_hash(const uint8_t * data, size_t len) {
const uint64_t fnv_prime = 0x100000001b3ULL;
uint64_t hash = 0xcbf29ce484222325ULL;
for (size_t i = 0; i < len; ++i) {
hash ^= data[i];
hash *= fnv_prime;
}
return std::to_string(hash);
}
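The decimal string above becomes the bitmap id (see the bmp.set_id(...) call earlier in this commit), so two occurrences of the same image compare equal in get_common_prefix and can reuse the cached prompt. A minimal sketch (not part of this patch) of the same FNV-1a computation in Python; note the server hashes the decoded RGB pixel buffer (nx*ny*3 bytes), not the encoded file:

def fnv1a_64(data: bytes) -> str:
    # same constants as the C++ helper above; the mask reproduces uint64_t wraparound
    fnv_prime = 0x100000001b3
    h = 0xcbf29ce484222325
    for b in data:
        h ^= b
        h = (h * fnv_prime) & 0xFFFFFFFFFFFFFFFF
    return str(h)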