Merge branch 'upstream' into concedo_experimental

# Conflicts: # .devops/full-cuda.Dockerfile # .devops/full-rocm.Dockerfile # .devops/llama-cli-cuda.Dockerfile # .devops/llama-cli-rocm.Dockerfile # .devops/llama-cli-vulkan.Dockerfile # .devops/llama-cpp-cuda.srpm.spec # .devops/llama-server-cuda.Dockerfile # .devops/llama-server-rocm.Dockerfile # .devops/llama-server-vulkan.Dockerfile # .github/workflows/build.yml # .github/workflows/docker.yml # CMakeLists.txt # Makefile # README.md # examples/llama.android/llama/src/main/cpp/CMakeLists.txt # flake.lock # ggml/CMakeLists.txt # ggml/src/CMakeLists.txt # grammars/README.md # scripts/sync-ggml-am.sh # scripts/sync-ggml.last # tests/test-chat-template.cpp # tests/test-grammar-integration.cpp # tests/test-json-schema-to-grammar.cpp
2025-09-11 01:24:36 +00:00 · 2024-06-30 10:59:42 +08:00 · 2024-06-30 10:59:42 +08:00 · 02f92f6ecc
commit 02f92f6ecc
parent 8421243c6d 72272b83a3
22 changed files with 632 additions and 182 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2021,6 +2021,7 @@ struct server_context {
                        slot.t_start_generation = 0;

                        if (slot.infill) {
+                            const bool add_bos = llama_should_add_bos_token(model);
                            bool suff_rm_leading_spc = true;
                            if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                                params.input_suffix.erase(0, 1);
@ -2036,16 +2037,21 @@ struct server_context {
                            }

                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                            prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
-                            prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
+                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                            if (add_bos) {
+                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                            }
+                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

                            const llama_token middle_token = llama_token_middle(model);
                            if (middle_token >= 0) {
-                                prefix_tokens.push_back(middle_token);
+                                embd_inp.push_back(middle_token);
                            }

-                            prompt_tokens = prefix_tokens;
+                            prompt_tokens = embd_inp;
                        } else {
                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                        }