Merge branch 'upstream' into concedo_experimental

This commit is contained in:
Concedo 2025-10-11 23:19:38 +08:00
commit 720fc30832
12 changed files with 149 additions and 98 deletions

View file

@ -5971,20 +5971,12 @@ class Mamba2Model(TextModel):
class JambaModel(TextModel):
model_arch = gguf.MODEL_ARCH.JAMBA
def get_vocab_base_pre(self, tokenizer) -> str:
del tokenizer # unused
return "gpt-2"
def set_vocab(self):
if (self.dir_model / "tokenizer.model").is_file():
# Using Jamba's tokenizer.json causes errors on model load
# (something about "byte not found in vocab"),
# but there's a working tokenizer.model
self._set_vocab_sentencepiece()
else:
# Some Jamba models only have a tokenizer.json, which works.
self._set_vocab_gpt2()
self._set_vocab_llama_hf()
self.gguf_writer.add_add_space_prefix(False)
def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])

View file

@ -3881,7 +3881,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->device = i;
dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
ggml_cuda_set_device(i);
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;

View file

@ -1546,9 +1546,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
!ggml_is_transposed(op->src[1]) &&
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
props_dev->has_simdgroup_mm && ne00 >= 64 &&
(ne11 > ne11_mm_min || (ggml_is_quantized(op->src[0]->type) && ne12 > 1))) {
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
props_dev->has_simdgroup_mm && ne00 >= 64 && ne11 > ne11_mm_min) {
//GGML_LOG_INFO("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)

View file

@ -7487,7 +7487,7 @@ kernel void kernel_mul_mv_iq1_m_f32(
kernel_mul_mv_iq1_m_f32_impl<N_R0_IQ1_M, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
}
template<int nr0, typename args_t>
template<int NR0, typename args_t>
void kernel_mul_mv_iq4_nl_f32_impl(
args_t args,
device const char * src0,
@ -7500,13 +7500,12 @@ void kernel_mul_mv_iq4_nl_f32_impl(
const short NSG = FC_mul_mv_nsg;
threadgroup float * shmem_f32 = (threadgroup float *) shmem;
const int nb = args.ne00/QK4_NL;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * NSG + sgitg) * nr0;
const int first_row = (r0 * NSG + sgitg) * NR0;
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
@ -7517,6 +7516,9 @@ void kernel_mul_mv_iq4_nl_f32_impl(
device const block_iq4_nl * x = (device const block_iq4_nl *) (src0 + offset0);
device const float * y = (device const float *) (src1 + offset1);
const int nb = args.ne00/QK4_NL;
const int ns01 = args.nb01/args.nb00;
const short ix = tiisg/2; // 0...15
const short it = tiisg%2; // 0 or 1
@ -7524,24 +7526,25 @@ void kernel_mul_mv_iq4_nl_f32_impl(
threadgroup_barrier(mem_flags::mem_threadgroup);
float4 yl[4];
float sumf[nr0]={0.f};
float sumf[NR0]={0.f};
device const float * yb = y + ix * QK4_NL + it * 8;
device const float * yb = y + ix*QK4_NL + it*8;
uint32_t aux32[2];
thread const uint8_t * q8 = (thread const uint8_t *)aux32;
float4 qf1, qf2;
for (int ib = ix; ib < nb; ib += 16) {
// [TAG_MUL_MV_WEIRD]
for (int ib = ix; ib < nb && ib < ns01; ib += 16) {
device const float4 * y4 = (device const float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
for (short row = 0; row < nr0; row++) {
device const block_iq4_nl & xb = x[row*nb + ib];
for (short row = 0; row < NR0; row++) {
device const block_iq4_nl & xb = x[row*ns01 + ib];
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
float4 acc1 = {0.f}, acc2 = {0.f};
@ -7572,7 +7575,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) {
float sum_all = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = sum_all;
@ -7594,7 +7597,7 @@ kernel void kernel_mul_mv_iq4_nl_f32(
kernel_mul_mv_iq4_nl_f32_impl<N_R0_IQ4_NL, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
}
template<int nr0, typename args_t>
template<int NR0, typename args_t>
void kernel_mul_mv_iq4_xs_f32_impl(
args_t args,
device const char * src0,
@ -7607,12 +7610,11 @@ void kernel_mul_mv_iq4_xs_f32_impl(
const short NSG = FC_mul_mv_nsg;
threadgroup float * shmem_f32 = (threadgroup float *) shmem;
const int nb = args.ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * NSG + sgitg) * nr0;
const int first_row = (r0 * NSG + sgitg) * NR0;
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
@ -7623,6 +7625,9 @@ void kernel_mul_mv_iq4_xs_f32_impl(
device const block_iq4_xs * x = (device const block_iq4_xs *) (src0 + offset0);
device const float * y = (device const float *) (src1 + offset1);
const int nb = args.ne00/QK_K;
const int ns01 = args.nb01/args.nb00;
const short ix = tiisg/16; // 0 or 1
const short it = tiisg%16; // 0...15
const short ib = it/2;
@ -7632,7 +7637,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
threadgroup_barrier(mem_flags::mem_threadgroup);
float4 yl[4];
float sumf[nr0]={0.f};
float sumf[NR0]={0.f};
device const float * yb = y + ix * QK_K + ib * 32 + il * 8;
@ -7641,15 +7646,16 @@ void kernel_mul_mv_iq4_xs_f32_impl(
float4 qf1, qf2;
for (int ibl = ix; ibl < nb; ibl += 2) {
// [TAG_MUL_MV_WEIRD]
for (int ibl = ix; ibl < nb && ibl < ns01; ibl += 2) {
device const float4 * y4 = (device const float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
for (short row = 0; row < nr0; ++row) {
device const block_iq4_xs & xb = x[row*nb + ibl];
for (short row = 0; row < NR0; ++row) {
device const block_iq4_xs & xb = x[row*ns01 + ibl];
device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);
float4 acc1 = {0.f}, acc2 = {0.f};
@ -7679,7 +7685,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) {
float sum_all = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = sum_all;
@ -7701,7 +7707,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
kernel_mul_mv_iq4_xs_f32_impl<N_R0_IQ4_XS, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
}
template<int nr0, typename args_t>
template<int NR0, typename args_t>
void kernel_mul_mv_mxfp4_f32_impl(
args_t args,
device const char * src0,
@ -7714,13 +7720,12 @@ void kernel_mul_mv_mxfp4_f32_impl(
const short NSG = FC_mul_mv_nsg;
threadgroup float * shmem_f32 = (threadgroup float *) shmem;
const int nb = args.ne00/QK_MXFP4;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * NSG + sgitg) * nr0;
const int first_row = (r0 * NSG + sgitg) * NR0;
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
@ -7731,6 +7736,9 @@ void kernel_mul_mv_mxfp4_f32_impl(
device const block_mxfp4 * x = (device const block_mxfp4 *) (src0 + offset0);
device const float * y = (device const float *) (src1 + offset1);
const int nb = args.ne00/QK_MXFP4;
const int ns01 = args.nb01/args.nb00; // this can be larger than nb for permuted src0 tensors
const short ix = tiisg/2; // 0...15
const short it = tiisg%2; // 0 or 1
@ -7738,20 +7746,22 @@ void kernel_mul_mv_mxfp4_f32_impl(
threadgroup_barrier(mem_flags::mem_threadgroup);
float4 yl[4];
float sumf[nr0]={0.f};
float sumf[NR0]={0.f};
device const float * yb = y + ix * QK_MXFP4 + it * 8;
device const float * yb = y + ix*QK_MXFP4 + it*8;
// note: just the check `ib < nb` is enough, but adding the redundant `&& ib < ns01` check makes the kernel a bit faster
// no idea why that is - needs some deeper investigation [TAG_MUL_MV_WEIRD]
for (int ib = ix; ib < nb && ib < ns01; ib += 16) {
device const float4 * y4 = (device const float4 *) yb;
for (int ib = ix; ib < nb; ib += 16) {
device const float4 * y4 = (device const float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
#pragma unroll(nr0)
for (short row = 0; row < nr0; row++) {
device const block_mxfp4 & xb = x[row*nb + ib];
FOR_UNROLL (short row = 0; row < NR0; row++) {
device const block_mxfp4 & xb = x[row*ns01 + ib];
device const uint8_t * q2 = (device const uint8_t *)(xb.qs + 8*it);
float4 acc1 = yl[0]*float4(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
@ -7769,7 +7779,7 @@ void kernel_mul_mv_mxfp4_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < NR0 && first_row + row < args.ne0; ++row) {
float sum_all = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = sum_all;

View file

@ -16375,10 +16375,10 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
ggml_tensor * inpSA,
const llama_model & model,
const int il) {
ggml_tensor * cur,
ggml_tensor * inpSA,
const llama_model & model,
const int il) {
// For Granite architectures - scale residual
if (hparams.f_residual_scale) {

Binary file not shown.

View file

@ -5401,15 +5401,6 @@ int main(int argc, char ** argv) {
const json body = json::parse(req.body);
// TODO: implement
//int top_n = 1;
//if (body.count("top_n") != 1) {
// top_n = body.at("top_n");
//} else {
// res_error(res, format_error_response("\"top_n\" must be provided", ERROR_TYPE_INVALID_REQUEST));
// return;
//}
// if true, use TEI API format, otherwise use Jina API format
// Jina: https://jina.ai/reranker/
// TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank
@ -5434,6 +5425,8 @@ int main(int argc, char ** argv) {
return;
}
int top_n = json_value(body, "top_n", (int)documents.size());
// create and queue the task
json responses = json::array();
bool error = false;
@ -5474,7 +5467,8 @@ int main(int argc, char ** argv) {
body,
responses,
is_tei_format,
documents);
documents,
top_n);
res_ok(res, root);
};

View file

@ -102,3 +102,45 @@ def test_rerank_usage(query, doc1, doc2, n_tokens):
assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == n_tokens
@pytest.mark.parametrize("top_n,expected_len", [
(None, len(TEST_DOCUMENTS)), # no top_n parameter
(2, 2),
(4, 4),
(99, len(TEST_DOCUMENTS)), # higher than available docs
])
def test_rerank_top_n(top_n, expected_len):
global server
server.start()
data = {
"query": "Machine learning is",
"documents": TEST_DOCUMENTS,
}
if top_n is not None:
data["top_n"] = top_n
res = server.make_request("POST", "/rerank", data=data)
assert res.status_code == 200
assert len(res.body["results"]) == expected_len
@pytest.mark.parametrize("top_n,expected_len", [
(None, len(TEST_DOCUMENTS)), # no top_n parameter
(2, 2),
(4, 4),
(99, len(TEST_DOCUMENTS)), # higher than available docs
])
def test_rerank_tei_top_n(top_n, expected_len):
global server
server.start()
data = {
"query": "Machine learning is",
"texts": TEST_DOCUMENTS,
}
if top_n is not None:
data["top_n"] = top_n
res = server.make_request("POST", "/rerank", data=data)
assert res.status_code == 200
assert len(res.body) == expected_len

View file

@ -849,47 +849,44 @@ static json format_response_rerank(
const json & request,
const json & ranks,
bool is_tei_format,
std::vector<std::string> & texts) {
json res;
if (is_tei_format) {
// TEI response format
res = json::array();
bool return_text = json_value(request, "return_text", false);
for (const auto & rank : ranks) {
int index = json_value(rank, "index", 0);
json elem = json{
{"index", index},
{"score", json_value(rank, "score", 0.0)},
};
if (return_text) {
elem["text"] = std::move(texts[index]);
}
res.push_back(elem);
}
} else {
// Jina response format
json results = json::array();
int32_t n_tokens = 0;
for (const auto & rank : ranks) {
results.push_back(json{
{"index", json_value(rank, "index", 0)},
{"relevance_score", json_value(rank, "score", 0.0)},
});
n_tokens += json_value(rank, "tokens_evaluated", 0);
}
res = json{
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"},
{"usage", json{
{"prompt_tokens", n_tokens},
{"total_tokens", n_tokens}
}},
{"results", results}
std::vector<std::string> & texts,
int top_n) {
int32_t n_tokens = 0;
bool return_text = is_tei_format && json_value(request, "return_text", false);
std::vector<json> elements; // Temporary vector to hold unsorted elements
std::string score_label = is_tei_format ? "score" : "relevance_score";
for (const auto & rank : ranks) {
int index = json_value(rank, "index", 0);
json elem = json{
{"index", index},
{score_label, json_value(rank, "score", 0.0)},
};
n_tokens += json_value(rank, "tokens_evaluated", 0);
if (return_text) {
elem["text"] = std::move(texts[index]);
}
elements.push_back(elem);
}
std::sort(elements.begin(), elements.end(), [score_label](const json& a, const json& b) {
return json_value(a, score_label, 0.0) > json_value(b, score_label, 0.0);
});
elements.resize(std::min(top_n, (int)elements.size()));
json results = elements;
if (is_tei_format) return results;
json res = json{
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", "list"},
{"usage", json{
{"prompt_tokens", n_tokens},
{"total_tokens", n_tokens}
}},
{"results", results}
};
return res;
}

View file

@ -2,8 +2,9 @@
import { Check, X } from '@lucide/svelte';
import { Card } from '$lib/components/ui/card';
import { Button } from '$lib/components/ui/button';
import { ChatAttachmentsList } from '$lib/components/app';
import { ChatAttachmentsList, MarkdownContent } from '$lib/components/app';
import { INPUT_CLASSES } from '$lib/constants/input-classes';
import { config } from '$lib/stores/settings.svelte';
import ChatMessageActions from './ChatMessageActions.svelte';
interface Props {
@ -55,6 +56,7 @@
let isMultiline = $state(false);
let messageElement: HTMLElement | undefined = $state();
const currentConfig = config();
$effect(() => {
if (!messageElement || !message.content.trim()) return;
@ -123,9 +125,18 @@
class="max-w-[80%] rounded-[1.125rem] bg-primary px-3.75 py-1.5 text-primary-foreground data-[multiline]:py-2.5"
data-multiline={isMultiline ? '' : undefined}
>
<span bind:this={messageElement} class="text-md whitespace-pre-wrap">
{message.content}
</span>
{#if currentConfig.renderUserContentAsMarkdown}
<div bind:this={messageElement} class="text-md">
<MarkdownContent
class="markdown-user-content text-primary-foreground"
content={message.content}
/>
</div>
{:else}
<span bind:this={messageElement} class="text-md whitespace-pre-wrap">
{message.content}
</span>
{/if}
</Card>
{/if}

View file

@ -80,6 +80,11 @@
key: 'showModelInfo',
label: 'Show model information',
type: 'checkbox'
},
{
key: 'renderUserContentAsMarkdown',
label: 'Render user content as Markdown',
type: 'checkbox'
}
]
},

View file

@ -12,6 +12,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
pasteLongTextToFileLen: 2500,
pdfAsImage: false,
showModelInfo: false,
renderUserContentAsMarkdown: false,
// make sure these default values are in sync with `common.h`
samplers: 'top_k;typ_p;top_p;min_p;temperature',
temperature: 0.8,
@ -84,6 +85,7 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
'Ask for confirmation before automatically changing conversation title when editing the first message.',
pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).',
showModelInfo: 'Display the model name used to generate each message below the message content.',
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
pyInterpreterEnabled:
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.'
};