Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	docs/build.md
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/kleidiai/kernels.cpp
#	ggml/src/ggml-cpu/kleidiai/kernels.h
#	ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
#	tests/test-backend-ops.cpp
#	tools/server/README.md
2026-05-21 18:52:02 +00:00 · 2025-07-21 23:37:42 +08:00 · 2025-07-21 23:37:42 +08:00 · 4abea4b5c9
commit 4abea4b5c9
parent f5aa7c2265 922042601b
4 changed files with 15 additions and 13 deletions
--- a/ggml/src/ggml-cuda/im2col.cu
+++ b/ggml/src/ggml-cuda/im2col.cu
@@ -10,7 +10,7 @@ static  __global__ void im2col_kernel(
        return;
    }

-    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t  ksize = OW * KH;
    const int64_t  kx = i / ksize;
    const int64_t  kd = kx * ksize;
    const int64_t  ky = (i - kd) / OW;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp
@@ -40,12 +40,10 @@ void main() {
    const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
    const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
    const int oh_s1 = int(oh) * p.s1;
-    const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
+    const uint ksize = p.OW * p.KH;

    const uint base_linear_idx = gidx * NUM_ITER;

-    const uint max_ky = ksize / p.OW;
-
    uint current_kx = base_linear_idx / ksize;
    const uint rem = base_linear_idx - (current_kx * ksize);
    uint current_ky = rem / p.OW;
@@ -76,7 +74,7 @@ void main() {

        if (++current_ix == p.OW) {
            current_ix = 0;
-            if (++current_ky == max_ky) {
+            if (++current_ky == p.KH) {
                current_ky = 0;
                current_kx++;
            }
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -786,14 +786,17 @@ int main(int argc, char ** argv) {
                }

                // check for reverse prompt using special tokens
-                llama_token last_token = common_sampler_last(smpl);
-                for (auto token : antiprompt_token) {
-                    if (token == last_token) {
-                        if (params.interactive) {
-                            is_interacting = true;
+                // avoid calling common_sampler_last() if last_output is empty
+                if (!last_output.empty()) {
+                    llama_token last_token = common_sampler_last(smpl);
+                    for (auto token : antiprompt_token) {
+                        if (token == last_token) {
+                            if (params.interactive) {
+                                is_interacting = true;
+                            }
+                            is_antiprompt = true;
+                            break;
                        }
-                        is_antiprompt = true;
-                        break;
                    }
                }

--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4516,9 +4516,10 @@ int main(int argc, char ** argv) {
        json tokens_response = json::array();
        if (body.count("content") != 0) {
            const bool add_special = json_value(body, "add_special", false);
+            const bool parse_special = json_value(body, "parse_special", true);
            const bool with_pieces = json_value(body, "with_pieces", false);

-            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special);

            if (with_pieces) {
                for (const auto& token : tokens) {