Merge branch 'upstream' into concedo_experimental

2026-05-19 08:00:25 +00:00 · 2025-10-11 23:19:38 +08:00 · 2025-10-11 23:19:38 +08:00 · 720fc30832
commit 720fc30832
parent e92f9fd422 a3cb04744f
12 changed files with 149 additions and 98 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -5971,20 +5971,12 @@ class Mamba2Model(TextModel):
 class JambaModel(TextModel):
    model_arch = gguf.MODEL_ARCH.JAMBA

-    def get_vocab_base_pre(self, tokenizer) -> str:
-        del tokenizer  # unused
-
-        return "gpt-2"
-
    def set_vocab(self):
        if (self.dir_model / "tokenizer.model").is_file():
-            # Using Jamba's tokenizer.json causes errors on model load
-            # (something about "byte not found in vocab"),
-            # but there's a working tokenizer.model
            self._set_vocab_sentencepiece()
        else:
-            # Some Jamba models only have a tokenizer.json, which works.
-            self._set_vocab_gpt2()
+            self._set_vocab_llama_hf()  # no tokenizer.model file in dir_model: use the llama-HF vocab loader
+            self.gguf_writer.add_add_space_prefix(False)  # NOTE(review): presumably writes add_space_prefix=false to the GGUF metadata - confirm

    def set_gguf_parameters(self):
        d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -3881,7 +3881,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                dev_ctx->device = i;
                dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);

-                ggml_cuda_set_device(i);
                cudaDeviceProp prop;
                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                dev_ctx->description = prop.name;
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@ -1546,9 +1546,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
        !ggml_is_transposed(op->src[1]) &&
        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-        props_dev->has_simdgroup_mm && ne00 >= 64 &&
-        (ne11 > ne11_mm_min || (ggml_is_quantized(op->src[0]->type) && ne12 > 1))) {
-        //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
+        props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) {
+        //GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);

        // some Metal matrix data types require aligned pointers
        // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@ -7487,7 +7487,7 @@ kernel void kernel_mul_mv_iq1_m_f32(
    kernel_mul_mv_iq1_m_f32_impl<N_R0_IQ1_M, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }

-template<int nr0, typename args_t>
+template<int NR0, typename args_t>
 void kernel_mul_mv_iq4_nl_f32_impl(
        args_t args,
        device const char * src0,
@ -7500,13 +7500,12 @@ void kernel_mul_mv_iq4_nl_f32_impl(
    const short NSG = FC_mul_mv_nsg;

    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
-    const int nb = args.ne00/QK4_NL;

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const int first_row = (r0 * NSG + sgitg) * nr0;
+    const int first_row = (r0 * NSG + sgitg) * NR0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;
@ -7517,6 +7516,9 @@ void kernel_mul_mv_iq4_nl_f32_impl(
    device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0);
    device const float        * y = (device const float        *) (src1 + offset1);

+    const int nb   = args.ne00/QK4_NL;
+    const int ns01 = args.nb01/args.nb00;
+
    const short ix = tiisg/2;  // 0...15
    const short it = tiisg%2;  // 0 or 1

@ -7524,24 +7526,25 @@ void kernel_mul_mv_iq4_nl_f32_impl(
    threadgroup_barrier(mem_flags::mem_threadgroup);

    float4 yl[4];
-    float sumf[nr0]={0.f};
+    float sumf[NR0]={0.f};

-    device const float * yb = y + ix * QK4_NL + it * 8;
+    device const float * yb = y + ix*QK4_NL + it*8;

    uint32_t aux32[2];
    thread const uint8_t * q8 = (thread const uint8_t *)aux32;

    float4 qf1, qf2;

-    for (int ib = ix; ib < nb; ib += 16) {
+    // [TAG_MUL_MV_WEIRD]
+    for (int ib = ix; ib < nb && ib < ns01; ib += 16) {
        device const float4 * y4 = (device const float4 *)yb;
        yl[0] = y4[0];
        yl[1] = y4[4];
        yl[2] = y4[1];
        yl[3] = y4[5];

-        for (short row = 0; row < nr0; row++) {
-            device const block_iq4_nl & xb = x[row*nb + ib];
+        for (short row = 0; row < NR0; row++) {
+            device const block_iq4_nl & xb = x[row*ns01 + ib];
            device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);

            float4 acc1 = {0.f}, acc2 = {0.f};
@ -7572,7 +7575,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
+    for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) {
        float sum_all = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = sum_all;
@ -7594,7 +7597,7 @@ kernel void kernel_mul_mv_iq4_nl_f32(
    kernel_mul_mv_iq4_nl_f32_impl<N_R0_IQ4_NL, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }

-template<int nr0, typename args_t>
+template<int NR0, typename args_t>
 void kernel_mul_mv_iq4_xs_f32_impl(
        args_t args,
        device const char * src0,
@ -7607,12 +7610,11 @@ void kernel_mul_mv_iq4_xs_f32_impl(
    const short NSG = FC_mul_mv_nsg;

    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
-    const int nb = args.ne00/QK_K;

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;
-    const int first_row = (r0 * NSG + sgitg) * nr0;
+    const int first_row = (r0 * NSG + sgitg) * NR0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;
@ -7623,6 +7625,9 @@ void kernel_mul_mv_iq4_xs_f32_impl(
    device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0);
    device const float        * y = (device const float        *) (src1 + offset1);

+    const int nb   = args.ne00/QK_K;
+    const int ns01 = args.nb01/args.nb00;
+
    const short ix = tiisg/16;  // 0 or 1
    const short it = tiisg%16;  // 0...15
    const short ib = it/2;
@ -7632,7 +7637,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
    threadgroup_barrier(mem_flags::mem_threadgroup);

    float4 yl[4];
-    float sumf[nr0]={0.f};
+    float sumf[NR0]={0.f};

    device const float * yb = y + ix * QK_K + ib * 32 + il * 8;

@ -7641,15 +7646,16 @@ void kernel_mul_mv_iq4_xs_f32_impl(

    float4 qf1, qf2;

-    for (int ibl = ix; ibl < nb; ibl += 2) {
+    // [TAG_MUL_MV_WEIRD]
+    for (int ibl = ix; ibl < nb && ibl < ns01; ibl += 2) {
        device const float4 * y4 = (device const float4 *)yb;
        yl[0] = y4[0];
        yl[1] = y4[4];
        yl[2] = y4[1];
        yl[3] = y4[5];

-        for (short row = 0; row < nr0; ++row) {
-            device const block_iq4_xs & xb = x[row*nb + ibl];
+        for (short row = 0; row < NR0; ++row) {
+            device const block_iq4_xs & xb = x[row*ns01 + ibl];
            device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);

            float4 acc1 = {0.f}, acc2 = {0.f};
@ -7679,7 +7685,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
+    for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) {
        float sum_all = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = sum_all;
@ -7701,7 +7707,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
    kernel_mul_mv_iq4_xs_f32_impl<N_R0_IQ4_XS, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
 }

-template<int nr0, typename args_t>
+template<int NR0, typename args_t>
 void kernel_mul_mv_mxfp4_f32_impl(
        args_t args,
        device const char * src0,
@ -7714,13 +7720,12 @@ void kernel_mul_mv_mxfp4_f32_impl(
    const short NSG = FC_mul_mv_nsg;

    threadgroup float * shmem_f32 = (threadgroup float *) shmem;
-    const int nb = args.ne00/QK_MXFP4;

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

-    const int first_row = (r0 * NSG + sgitg) * nr0;
+    const int first_row = (r0 * NSG + sgitg) * NR0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;
@ -7731,6 +7736,9 @@ void kernel_mul_mv_mxfp4_f32_impl(
    device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0);
    device const float       * y = (device const float       *) (src1 + offset1);

+    const int nb   = args.ne00/QK_MXFP4;
+    const int ns01 = args.nb01/args.nb00; // this can be larger than nb for permuted src0 tensors
+
    const short ix = tiisg/2;  // 0...15
    const short it = tiisg%2;  // 0 or 1

@ -7738,20 +7746,22 @@ void kernel_mul_mv_mxfp4_f32_impl(
    threadgroup_barrier(mem_flags::mem_threadgroup);

    float4 yl[4];
-    float sumf[nr0]={0.f};
+    float sumf[NR0]={0.f};

-    device const float * yb = y + ix * QK_MXFP4 + it * 8;
+    device const float * yb = y + ix*QK_MXFP4 + it*8;
+
+    // note: just the check `ib < nb` is enough, but adding the redundant `&& ib < ns01` check makes the kernel a bit faster
+    //       no idea why that is - needs some deeper investigation [TAG_MUL_MV_WEIRD]
+    for (int ib = ix; ib < nb && ib < ns01; ib += 16) {
+        device const float4 * y4 = (device const float4 *) yb;

-    for (int ib = ix; ib < nb; ib += 16) {
-        device const float4 * y4 = (device const float4 *)yb;
        yl[0] = y4[0];
        yl[1] = y4[4];
        yl[2] = y4[1];
        yl[3] = y4[5];

-#pragma unroll(nr0)
-        for (short row = 0; row < nr0; row++) {
-            device const block_mxfp4 & xb = x[row*nb + ib];
+        FOR_UNROLL (short row = 0; row < NR0; row++) {
+            device const block_mxfp4 & xb = x[row*ns01 + ib];
            device const uint8_t     * q2 = (device const uint8_t *)(xb.qs + 8*it);

            float4 acc1 = yl[0]*float4(shmem_f32[q2[0] &  0x0F], shmem_f32[q2[1] &  0x0F], shmem_f32[q2[2] &  0x0F], shmem_f32[q2[3] &  0x0F]);
@ -7769,7 +7779,7 @@ void kernel_mul_mv_mxfp4_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
+    for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) {
        float sum_all = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = sum_all;
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -16375,10 +16375,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
    }

    ggml_tensor * build_layer_ffn(
-              ggml_tensor       * cur,
-              ggml_tensor       * inpSA,
-        const llama_model       & model,
-        const int                 il) {
+              ggml_tensor * cur,
+              ggml_tensor * inpSA,
+        const llama_model & model,
+        const int           il) {

        // For Granite architectures - scale residual
        if (hparams.f_residual_scale) {
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -5401,15 +5401,6 @@ int main(int argc, char ** argv) {

        const json body = json::parse(req.body);

-        // TODO: implement
-        //int top_n = 1;
-        //if (body.count("top_n") != 1) {
-        //    top_n = body.at("top_n");
-        //} else {
-        //    res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST));
-        //    return;
-        //}
-
        // if true, use TEI API format, otherwise use Jina API format
        // Jina: https://jina.ai/reranker/
        // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
@ -5434,6 +5425,8 @@ int main(int argc, char ** argv) {
            return;
        }

+        int top_n = json_value(body, "top_n", (int)documents.size());
+
        // create and queue the task
        json responses = json::array();
        bool error = false;
@ -5474,7 +5467,8 @@ int main(int argc, char ** argv) {
            body,
            responses,
            is_tei_format,
-            documents);
+            documents,
+            top_n);

        res_ok(res, root);
    };
--- a/tools/server/tests/unit/test_rerank.py
+++ b/tools/server/tests/unit/test_rerank.py
@ -102,3 +102,45 @@ def test_rerank_usage(query, doc1, doc2, n_tokens):
    assert res.status_code == 200
    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
    assert res.body['usage']['prompt_tokens'] == n_tokens
+
+
+@pytest.mark.parametrize("top_n,expected_len", [
+    (None, len(TEST_DOCUMENTS)),  # request omits top_n entirely
+    (2, 2),
+    (4, 4),
+    (99, len(TEST_DOCUMENTS)),    # top_n exceeds the number of documents
+])
+def test_rerank_top_n(top_n, expected_len):
+    """POST /rerank with a "documents" list; "results" must hold `expected_len` entries."""
+    global server
+    server.start()
+    payload = {"query": "Machine learning is", "documents": TEST_DOCUMENTS}
+    # only send top_n when the parametrized case provides one
+    if top_n is not None:
+        payload["top_n"] = top_n
+
+    response = server.make_request("POST", "/rerank", data=payload)
+    assert response.status_code == 200
+    ranked = response.body["results"]
+    assert len(ranked) == expected_len
+
+
+@pytest.mark.parametrize("top_n,expected_len", [
+    (None, len(TEST_DOCUMENTS)),  # request omits top_n entirely
+    (2, 2),
+    (4, 4),
+    (99, len(TEST_DOCUMENTS)),    # top_n exceeds the number of documents
+])
+def test_rerank_tei_top_n(top_n, expected_len):
+    """POST /rerank with a "texts" list; the response body itself must hold `expected_len` entries."""
+    global server
+    server.start()
+    payload = {"query": "Machine learning is", "texts": TEST_DOCUMENTS}
+    # only send top_n when the parametrized case provides one
+    if top_n is not None:
+        payload["top_n"] = top_n
+
+    response = server.make_request("POST", "/rerank", data=payload)
+    assert response.status_code == 200
+    ranked = response.body
+    assert len(ranked) == expected_len
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@ -849,47 +849,52 @ static json format_response_rerank(
        const json & request,
        const json & ranks,
        bool is_tei_format,
-        std::vector<std::string> & texts) {
-    json res;
-    if (is_tei_format) {
-        // TEI response format
-        res = json::array();
-        bool return_text = json_value(request, "return_text", false);
-        for (const auto & rank : ranks) {
-            int index = json_value(rank, "index", 0);
-            json elem = json{
-                {"index", index},
-                {"score", json_value(rank, "score", 0.0)},
-            };
-            if (return_text) {
-                elem["text"] = std::move(texts[index]);
-            }
-            res.push_back(elem);
-        }
-    } else {
-        // Jina response format
-        json results = json::array();
-        int32_t n_tokens = 0;
-        for (const auto & rank : ranks) {
-            results.push_back(json{
-                {"index",           json_value(rank, "index", 0)},
-                {"relevance_score", json_value(rank, "score", 0.0)},
-            });
-
-            n_tokens += json_value(rank, "tokens_evaluated", 0);
-        }
-
-        res = json{
-            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-            {"object", "list"},
-            {"usage", json{
-                {"prompt_tokens", n_tokens},
-                {"total_tokens", n_tokens}
-            }},
-            {"results", results}
+        std::vector<std::string> & texts,
+        int top_n) {
+    // Build the rerank response from the per-document entries in `ranks`.
+    //   TEI format:  a bare JSON array of {index, score[, text]}
+    //   Jina format: {model, object, usage, results: [{index, relevance_score}, ...]}
+    // Entries are ordered by descending score and truncated to at most `top_n` items.
+    // note: with "return_text" set, the strings are moved out of `texts`
+    int32_t n_tokens = 0;
+    bool return_text = is_tei_format && json_value(request, "return_text", false);
+    std::vector<json> elements; // Temporary vector to hold unsorted elements
+    std::string score_label = is_tei_format ? "score" : "relevance_score";
+    for (const auto & rank : ranks) {
+        int index = json_value(rank, "index", 0);
+        json elem = json{
+            {"index", index},
+            {score_label, json_value(rank, "score", 0.0)},
        };
+        n_tokens += json_value(rank, "tokens_evaluated", 0);
+        if (return_text) {
+            elem["text"] = std::move(texts[index]);
+        }
+        elements.push_back(std::move(elem));
    }

+    std::sort(elements.begin(), elements.end(), [&score_label](const json & a, const json & b) {
+        return json_value(a, score_label, 0.0) > json_value(b, score_label, 0.0);
+    });
+
+    // `top_n` is caller-supplied and may be negative; clamp it to [0, size] so that
+    // resize() never receives a negative value converted to a huge size_t
+    const int n_keep = std::max(0, std::min(top_n, (int) elements.size()));
+    elements.resize(n_keep);
+    json results = elements;
+
+    if (is_tei_format) return results;
+
+    json res = json{
+        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"object", "list"},
+        {"usage", json{
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
+        }},
+        {"results", results}
+    };
+
    return res;
 }

--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte
@ -2,8 +2,9 @@
 	import { Check, X } from '@lucide/svelte';
 	import { Card } from '$lib/components/ui/card';
 	import { Button } from '$lib/components/ui/button';
-	import { ChatAttachmentsList } from '$lib/components/app';
+	import { ChatAttachmentsList, MarkdownContent } from '$lib/components/app';
 	import { INPUT_CLASSES } from '$lib/constants/input-classes';
+	import { config } from '$lib/stores/settings.svelte';
 	import ChatMessageActions from './ChatMessageActions.svelte';

 	interface Props {
@ -55,6 +56,7 @@

 	let isMultiline = $state(false);
 	let messageElement: HTMLElement | undefined = $state();
+	const currentConfig = config();

 	$effect(() => {
 		if (!messageElement || !message.content.trim()) return;
@ -123,9 +125,18 @@
 				class="max-w-[80%] rounded-[1.125rem] bg-primary px-3.75 py-1.5 text-primary-foreground data-[multiline]:py-2.5"
 				data-multiline={isMultiline ? '' : undefined}
 			>
-				<span bind:this={messageElement} class="text-md whitespace-pre-wrap">
-					{message.content}
-				</span>
+				{#if currentConfig.renderUserContentAsMarkdown}
+					<div bind:this={messageElement} class="text-md">
+						<MarkdownContent
+							class="markdown-user-content text-primary-foreground"
+							content={message.content}
+						/>
+					</div>
+				{:else}
+					<span bind:this={messageElement} class="text-md whitespace-pre-wrap">
+						{message.content}
+					</span>
+				{/if}
 			</Card>
 		{/if}

--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte
@ -80,6 +80,11 @@
 					key: 'showModelInfo',
 					label: 'Show model information',
 					type: 'checkbox'
+				},
+				{
+					key: 'renderUserContentAsMarkdown',
+					label: 'Render user content as Markdown',
+					type: 'checkbox'
 				}
 			]
 		},
--- a/tools/server/webui/src/lib/constants/settings-config.ts
+++ b/tools/server/webui/src/lib/constants/settings-config.ts
@ -12,6 +12,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
 	pasteLongTextToFileLen: 2500,
 	pdfAsImage: false,
 	showModelInfo: false,
+	renderUserContentAsMarkdown: false,
 	// make sure these default values are in sync with `common.h`
 	samplers: 'top_k;typ_p;top_p;min_p;temperature',
 	temperature: 0.8,
@ -84,6 +85,7 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 		'Ask for confirmation before automatically changing conversation title when editing the first message.',
 	pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).',
 	showModelInfo: 'Display the model name used to generate each message below the message content.',
+	renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
 	pyInterpreterEnabled:
 		'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.'
 };