diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b11eb8e35..2fbd020b9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5971,20 +5971,12 @@ class Mamba2Model(TextModel): class JambaModel(TextModel): model_arch = gguf.MODEL_ARCH.JAMBA - def get_vocab_base_pre(self, tokenizer) -> str: - del tokenizer # unused - - return "gpt-2" - def set_vocab(self): if (self.dir_model / "tokenizer.model").is_file(): - # Using Jamba's tokenizer.json causes errors on model load - # (something about "byte not found in vocab"), - # but there's a working tokenizer.model self._set_vocab_sentencepiece() else: - # Some Jamba models only have a tokenizer.json, which works. - self._set_vocab_gpt2() + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 5c5d00514..f4f9bce10 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3881,7 +3881,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { dev_ctx->device = i; dev_ctx->name = GGML_CUDA_NAME + std::to_string(i); - ggml_cuda_set_device(i); cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, i)); dev_ctx->description = prop.name; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 1137e2107..5f9370449 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -1546,9 +1546,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) { !ggml_is_transposed(op->src[1]) && // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - props_dev->has_simdgroup_mm && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(op->src[0]->type) && ne12 > 1))) { - //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) { + //GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); // some Metal matrix data types require aligned pointers // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 45d91def8..ddc285042 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -7487,7 +7487,7 @@ kernel void kernel_mul_mv_iq1_m_f32( kernel_mul_mv_iq1_m_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); } -template +template void kernel_mul_mv_iq4_nl_f32_impl( args_t args, device const char * src0, @@ -7500,13 +7500,12 @@ void kernel_mul_mv_iq4_nl_f32_impl( const short NSG = FC_mul_mv_nsg; threadgroup float * shmem_f32 = (threadgroup float *) shmem; - const int nb = args.ne00/QK4_NL; const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; - const int first_row = (r0 * NSG + sgitg) * nr0; + const int first_row = (r0 * NSG + sgitg) * NR0; const uint i12 = im%args.ne12; const uint i13 = im/args.ne12; @@ -7517,6 +7516,9 @@ void kernel_mul_mv_iq4_nl_f32_impl( device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0); device const float * y = (device const float *) (src1 + offset1); + const int nb = args.ne00/QK4_NL; + const int ns01 = args.nb01/args.nb00; + const short ix = tiisg/2; // 0...15 const short it = tiisg%2; // 0 or 1 @@ -7524,24 +7526,25 @@ void kernel_mul_mv_iq4_nl_f32_impl( threadgroup_barrier(mem_flags::mem_threadgroup); float4 yl[4]; - float sumf[nr0]={0.f}; + float sumf[NR0]={0.f}; - device const float * yb = y + ix * QK4_NL + it * 8; + device const float * yb = y + ix*QK4_NL + it*8; uint32_t aux32[2]; thread const uint8_t * q8 = (thread const uint8_t *)aux32; float4 qf1, qf2; - for (int ib = ix; ib < nb; ib += 16) { + // [TAG_MUL_MV_WEIRD] + for (int ib = ix; ib < nb && ib < ns01; ib += 16) { device const float4 * y4 = (device const float4 *)yb; yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; - for (short row = 0; row < nr0; row++) { - device const block_iq4_nl & xb = x[row*nb + ib]; + for (short row = 0; row < NR0; row++) { + device const block_iq4_nl & xb = x[row*ns01 + ib]; device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it); float4 acc1 = {0.f}, acc2 = {0.f}; @@ -7572,7 +7575,7 @@ void kernel_mul_mv_iq4_nl_f32_impl( device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { float sum_all = simd_sum(sumf[row]); if (tiisg == 0) { dst_f32[first_row + row] = sum_all; @@ -7594,7 +7597,7 @@ kernel void kernel_mul_mv_iq4_nl_f32( kernel_mul_mv_iq4_nl_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); } -template +template void kernel_mul_mv_iq4_xs_f32_impl( args_t args, device const char * src0, @@ -7607,12 +7610,11 @@ void kernel_mul_mv_iq4_xs_f32_impl( const short NSG = FC_mul_mv_nsg; threadgroup float * shmem_f32 = (threadgroup float *) shmem; - const int nb = args.ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; - const int first_row = (r0 * NSG + sgitg) * nr0; + const int first_row = (r0 * NSG + sgitg) * NR0; const uint i12 = im%args.ne12; const uint i13 = im/args.ne12; @@ -7623,6 +7625,9 @@ void kernel_mul_mv_iq4_xs_f32_impl( device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0); device const float * y = (device const float *) (src1 + offset1); + const int nb = args.ne00/QK_K; + const int ns01 = args.nb01/args.nb00; + const short ix = tiisg/16; // 0 or 1 const short it = tiisg%16; // 0...15 const short ib = it/2; @@ -7632,7 +7637,7 @@ void kernel_mul_mv_iq4_xs_f32_impl( threadgroup_barrier(mem_flags::mem_threadgroup); float4 yl[4]; - float sumf[nr0]={0.f}; + float sumf[NR0]={0.f}; device const float * yb = y + ix * QK_K + ib * 32 + il * 8; @@ -7641,15 +7646,16 @@ void kernel_mul_mv_iq4_xs_f32_impl( float4 qf1, qf2; - for (int ibl = ix; ibl < nb; ibl += 2) { + // [TAG_MUL_MV_WEIRD] + for (int ibl = ix; ibl < nb && ibl < ns01; ibl += 2) { device const float4 * y4 = (device const float4 *)yb; yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; - for (short row = 0; row < nr0; ++row) { - device const block_iq4_xs & xb = x[row*nb + ibl]; + for (short row = 0; row < NR0; ++row) { + device const block_iq4_xs & xb = x[row*ns01 + ibl]; device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il); float4 acc1 = {0.f}, acc2 = {0.f}; @@ -7679,7 +7685,7 @@ void kernel_mul_mv_iq4_xs_f32_impl( device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { float sum_all = simd_sum(sumf[row]); if (tiisg == 0) { dst_f32[first_row + row] = sum_all; @@ -7701,7 +7707,7 @@ kernel void kernel_mul_mv_iq4_xs_f32( kernel_mul_mv_iq4_xs_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg); } -template +template void kernel_mul_mv_mxfp4_f32_impl( args_t args, device const char * src0, @@ -7714,13 +7720,12 @@ void kernel_mul_mv_mxfp4_f32_impl( const short NSG = FC_mul_mv_nsg; threadgroup float * shmem_f32 = (threadgroup float *) shmem; - const int nb = args.ne00/QK_MXFP4; const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; - const int first_row = (r0 * NSG + sgitg) * nr0; + const int first_row = (r0 * NSG + sgitg) * NR0; const uint i12 = im%args.ne12; const uint i13 = im/args.ne12; @@ -7731,6 +7736,9 @@ void kernel_mul_mv_mxfp4_f32_impl( device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0); device const float * y = (device const float *) (src1 + offset1); + const int nb = args.ne00/QK_MXFP4; + const int ns01 = args.nb01/args.nb00; // this can be larger than nb for permuted src0 tensors + const short ix = tiisg/2; // 0...15 const short it = tiisg%2; // 0 or 1 @@ -7738,20 +7746,22 @@ void kernel_mul_mv_mxfp4_f32_impl( threadgroup_barrier(mem_flags::mem_threadgroup); float4 yl[4]; - float sumf[nr0]={0.f}; + float sumf[NR0]={0.f}; - device const float * yb = y + ix * QK_MXFP4 + it * 8; + device const float * yb = y + ix*QK_MXFP4 + it*8; + + // note: just the check `ib < nb` is enough, but adding the redundant `&& ib < ns01` check makes the kernel a bit faster + // no idea why that is - needs some deeper investigation [TAG_MUL_MV_WEIRD] + for (int ib = ix; ib < nb && ib < ns01; ib += 16) { + device const float4 * y4 = (device const float4 *) yb; - for (int ib = ix; ib < nb; ib += 16) { - device const float4 * y4 = (device const float4 *)yb; yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5]; -#pragma unroll(nr0) - for (short row = 0; row < nr0; row++) { - device const block_mxfp4 & xb = x[row*nb + ib]; + FOR_UNROLL (short row = 0; row < NR0; row++) { + device const block_mxfp4 & xb = x[row*ns01 + ib]; device const uint8_t * q2 = (device const uint8_t *)(xb.qs + 8*it); float4 acc1 = yl[0]*float4(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]); @@ -7769,7 +7779,7 @@ void kernel_mul_mv_mxfp4_f32_impl( device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; - for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) { float sum_all = simd_sum(sumf[row]); if (tiisg == 0) { dst_f32[first_row + row] = sum_all; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8357e3d81..64eb3c6d3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -16375,10 +16375,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { } ggml_tensor * build_layer_ffn( - ggml_tensor * cur, - ggml_tensor * inpSA, - const llama_model & model, - const int il) { + ggml_tensor * cur, + ggml_tensor * inpSA, + const llama_model & model, + const int il) { // For Granite architectures - scale residual if (hparams.f_residual_scale) { diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 5026edceb..d0c44534e 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 60326e8e5..cf12805b4 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5401,15 +5401,6 @@ int main(int argc, char ** argv) { const json body = json::parse(req.body); - // TODO: implement - //int top_n = 1; - //if (body.count("top_n") != 1) { - // top_n = body.at("top_n"); - //} else { - // res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - // return; - //} - // if true, use TEI API format, otherwise use Jina API format // Jina: https://jina.ai/reranker/ // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank @@ -5434,6 +5425,8 @@ int main(int argc, char ** argv) { return; } + int top_n = json_value(body, "top_n", (int)documents.size()); + // create and queue the task json responses = json::array(); bool error = false; @@ -5474,7 +5467,8 @@ int main(int argc, char ** argv) { body, responses, is_tei_format, - documents); + documents, + top_n); res_ok(res, root); }; diff --git a/tools/server/tests/unit/test_rerank.py b/tools/server/tests/unit/test_rerank.py index 0b63c7821..ded826710 100644 --- a/tools/server/tests/unit/test_rerank.py +++ b/tools/server/tests/unit/test_rerank.py @@ -102,3 +102,45 @@ def test_rerank_usage(query, doc1, doc2, n_tokens): assert res.status_code == 200 assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] assert res.body['usage']['prompt_tokens'] == n_tokens + + +@pytest.mark.parametrize("top_n,expected_len", [ + (None, len(TEST_DOCUMENTS)), # no top_n parameter + (2, 2), + (4, 4), + (99, len(TEST_DOCUMENTS)), # higher than available docs +]) +def test_rerank_top_n(top_n, expected_len): + global server + server.start() + data = { + "query": "Machine learning is", + "documents": TEST_DOCUMENTS, + } + if top_n is not None: + data["top_n"] = top_n + + res = server.make_request("POST", "/rerank", data=data) + assert res.status_code == 200 + assert len(res.body["results"]) == expected_len + + +@pytest.mark.parametrize("top_n,expected_len", [ + (None, len(TEST_DOCUMENTS)), # no top_n parameter + (2, 2), + (4, 4), + (99, len(TEST_DOCUMENTS)), # higher than available docs +]) +def test_rerank_tei_top_n(top_n, expected_len): + global server + server.start() + data = { + "query": "Machine learning is", + "texts": TEST_DOCUMENTS, + } + if top_n is not None: + data["top_n"] = top_n + + res = server.make_request("POST", "/rerank", data=data) + assert res.status_code == 200 + assert len(res.body) == expected_len diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index f175115f4..fd0bc8de5 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -849,47 +849,44 @@ static json format_response_rerank( const json & request, const json & ranks, bool is_tei_format, - std::vector & texts) { - json res; - if (is_tei_format) { - // TEI response format - res = json::array(); - bool return_text = json_value(request, "return_text", false); - for (const auto & rank : ranks) { - int index = json_value(rank, "index", 0); - json elem = json{ - {"index", index}, - {"score", json_value(rank, "score", 0.0)}, - }; - if (return_text) { - elem["text"] = std::move(texts[index]); - } - res.push_back(elem); - } - } else { - // Jina response format - json results = json::array(); - int32_t n_tokens = 0; - for (const auto & rank : ranks) { - results.push_back(json{ - {"index", json_value(rank, "index", 0)}, - {"relevance_score", json_value(rank, "score", 0.0)}, - }); - - n_tokens += json_value(rank, "tokens_evaluated", 0); - } - - res = json{ - {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", "list"}, - {"usage", json{ - {"prompt_tokens", n_tokens}, - {"total_tokens", n_tokens} - }}, - {"results", results} + std::vector & texts, + int top_n) { + int32_t n_tokens = 0; + bool return_text = is_tei_format && json_value(request, "return_text", false); + std::vector elements; // Temporary vector to hold unsorted elements + std::string score_label = is_tei_format ? "score" : "relevance_score"; + for (const auto & rank : ranks) { + int index = json_value(rank, "index", 0); + json elem = json{ + {"index", index}, + {score_label, json_value(rank, "score", 0.0)}, }; + n_tokens += json_value(rank, "tokens_evaluated", 0); + if (return_text) { + elem["text"] = std::move(texts[index]); + } + elements.push_back(elem); } + std::sort(elements.begin(), elements.end(), [score_label](const json& a, const json& b) { + return json_value(a, score_label, 0.0) > json_value(b, score_label, 0.0); + }); + + elements.resize(std::min(top_n, (int)elements.size())); + json results = elements; + + if (is_tei_format) return results; + + json res = json{ + {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", "list"}, + {"usage", json{ + {"prompt_tokens", n_tokens}, + {"total_tokens", n_tokens} + }}, + {"results", results} + }; + return res; } diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte index 66369b2f1..cc2631b83 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte @@ -2,8 +2,9 @@ import { Check, X } from '@lucide/svelte'; import { Card } from '$lib/components/ui/card'; import { Button } from '$lib/components/ui/button'; - import { ChatAttachmentsList } from '$lib/components/app'; + import { ChatAttachmentsList, MarkdownContent } from '$lib/components/app'; import { INPUT_CLASSES } from '$lib/constants/input-classes'; + import { config } from '$lib/stores/settings.svelte'; import ChatMessageActions from './ChatMessageActions.svelte'; interface Props { @@ -55,6 +56,7 @@ let isMultiline = $state(false); let messageElement: HTMLElement | undefined = $state(); + const currentConfig = config(); $effect(() => { if (!messageElement || !message.content.trim()) return; @@ -123,9 +125,18 @@ class="max-w-[80%] rounded-[1.125rem] bg-primary px-3.75 py-1.5 text-primary-foreground data-[multiline]:py-2.5" data-multiline={isMultiline ? '' : undefined} > - - {message.content} - + {#if currentConfig.renderUserContentAsMarkdown} +
+ +
+ {:else} + + {message.content} + + {/if} {/if} diff --git a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte index 7c25e5925..dc617afdc 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsDialog.svelte @@ -80,6 +80,11 @@ key: 'showModelInfo', label: 'Show model information', type: 'checkbox' + }, + { + key: 'renderUserContentAsMarkdown', + label: 'Render user content as Markdown', + type: 'checkbox' } ] }, diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts index 63e4364ae..154ec888c 100644 --- a/tools/server/webui/src/lib/constants/settings-config.ts +++ b/tools/server/webui/src/lib/constants/settings-config.ts @@ -12,6 +12,7 @@ export const SETTING_CONFIG_DEFAULT: Record = pasteLongTextToFileLen: 2500, pdfAsImage: false, showModelInfo: false, + renderUserContentAsMarkdown: false, // make sure these default values are in sync with `common.h` samplers: 'top_k;typ_p;top_p;min_p;temperature', temperature: 0.8, @@ -84,6 +85,7 @@ export const SETTING_CONFIG_INFO: Record = { 'Ask for confirmation before automatically changing conversation title when editing the first message.', pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).', showModelInfo: 'Display the model name used to generate each message below the message content.', + renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.', pyInterpreterEnabled: 'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.' };