Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	.github/workflows/server.yml
#	README.md
#	docs/build.md
#	docs/install.md
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/vecdotq.hpp
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp
This commit is contained in:
Concedo 2025-06-05 11:03:34 +08:00
commit bc89b465a8
35 changed files with 1070 additions and 288 deletions

View file

@ -70,6 +70,7 @@ struct mtmd_cli_context {
llama_model * model;
llama_context * lctx;
const llama_vocab * vocab;
common_sampler * smpl;
llama_batch batch;
int n_batch;
@ -89,8 +90,9 @@ struct mtmd_cli_context {
model = llama_init.model.get();
lctx = llama_init.context.get();
vocab = llama_model_get_vocab(model);
smpl = common_sampler_init(model, params.sampling);
n_threads = params.cpuparams.n_threads;
batch = llama_batch_init(params.n_batch, 0, 1);
batch = llama_batch_init(1, 0, 1); // batch for next token generation
n_batch = params.n_batch;
if (!model || !lctx) {
@ -118,6 +120,11 @@ struct mtmd_cli_context {
}
}
~mtmd_cli_context() {
llama_batch_free(batch);
common_sampler_free(smpl);
}
void init_vision_context(common_params & params) {
const char * clip_path = params.mmproj.path.c_str();
mtmd_context_params mparams = mtmd_context_params_default();
@ -153,7 +160,7 @@ struct mtmd_cli_context {
}
};
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
static int generate_response(mtmd_cli_context & ctx, int n_predict) {
llama_tokens generated_tokens;
for (int i = 0; i < n_predict; i++) {
if (i > n_predict || !g_is_generating || g_is_interrupted) {
@ -161,9 +168,9 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
break;
}
llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
generated_tokens.push_back(token_id);
common_sampler_accept(smpl, token_id, true);
common_sampler_accept(ctx.smpl, token_id, true);
if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
LOG("\n");
@ -261,7 +268,6 @@ int main(int argc, char ** argv) {
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
// Ctrl+C handling
@ -300,7 +306,7 @@ int main(int argc, char ** argv) {
if (eval_message(ctx, msg, true)) {
return 1;
}
if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
if (!g_is_interrupted && generate_response(ctx, n_predict)) {
return 1;
}
@ -366,7 +372,7 @@ int main(int argc, char ** argv) {
return 1;
}
if (g_is_interrupted) break;
if (generate_response(ctx, smpl, n_predict)) {
if (generate_response(ctx, n_predict)) {
return 1;
}
content.clear();

View file

@ -311,6 +311,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
GGML_ABORT("chunk type not supported");
}
llama_batch_free(text_batch);
return 0;
}

View file

@ -360,7 +360,7 @@ struct server_task {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
}
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}
@ -2016,6 +2016,11 @@ struct server_context {
params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
}
if (!params_base.speculative.model.path.empty()) {
SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
return false;
}
}
return true;
@ -3203,9 +3208,7 @@ struct server_context {
}
} else {
// if we don't cache the prompt, we have to remove the entire KV cache
llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
slot.n_past = 0;
slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
}
if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
@ -3220,7 +3223,6 @@ struct server_context {
SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
slot.n_past = 0;
}
}

View file

@ -499,13 +499,12 @@ def do_test_calc_result(server: ServerProcess, result_override: str | None, n_pr
@pytest.mark.slow
@pytest.mark.parametrize("n_predict,reasoning_format,stream,expect_reasoning_content,expect_content,hf_repo,template_override", [
(128, 'deepseek', CompletionMode.NORMAL, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(128, None, CompletionMode.NORMAL, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(1024, 'deepseek', CompletionMode.NORMAL, "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
(1024, 'deepseek', CompletionMode.STREAMED, None, "^<think>I need to calculate [\\s\\S]*?</think>To find the sum of [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
(1024, 'deepseek', CompletionMode.NORMAL, "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
(1024, 'deepseek', CompletionMode.STREAMED, None, "^<think>First, I [\\s\\S]*?</think>To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("n_predict,reasoning_format,expect_reasoning_content,expect_content,hf_repo,template_override", [
(128, 'deepseek', None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(128, None, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(1024, 'deepseek', "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
(1024, 'deepseek', "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
# (1024, 'none', CompletionMode.NORMAL, None, "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
# (128, 'deepseek', None, "^Okay, let me figure out the sum of 102 and 7[\\s\\S]*", "bartowski/Qwen_QwQ-32B-GGUF:Q4_K_M", None),
])

View file

@ -308,10 +308,12 @@ class ServerProcess:
stream = data.get('stream', False)
if stream:
content: list[str] = []
reasoning_content: list[str] = []
tool_calls: list[dict] = []
finish_reason: Optional[str] = None
content_parts = 0
reasoning_content_parts = 0
tool_call_parts = 0
arguments_parts = 0
@ -322,6 +324,10 @@ class ServerProcess:
assert len(choice['delta']['content']) > 0, f'Expected non empty content delta!'
content.append(choice['delta']['content'])
content_parts += 1
if choice['delta'].get('reasoning_content') is not None:
assert len(choice['delta']['reasoning_content']) > 0, f'Expected non empty reasoning_content delta!'
reasoning_content.append(choice['delta']['reasoning_content'])
reasoning_content_parts += 1
if choice['delta'].get('finish_reason') is not None:
finish_reason = choice['delta']['finish_reason']
for tc in choice['delta'].get('tool_calls', []):
@ -349,8 +355,10 @@ class ServerProcess:
tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
if fct.get('arguments') is not None:
tool_call['function']['arguments'] += fct['arguments']
arguments_parts += 1
tool_call_parts += 1
print(f'Streamed response had {content_parts} content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
print(f'Streamed response had {content_parts} content parts, {reasoning_content_parts} reasoning_content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
result = dict(
choices=[
dict(
@ -359,6 +367,7 @@ class ServerProcess:
message=dict(
role='assistant',
content=''.join(content) if content else None,
reasoning_content=''.join(reasoning_content) if reasoning_content else None,
tool_calls=tool_calls if tool_calls else None,
),
)