Merge commit 'd4e0d95cf5' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	common/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt
Concedo · 2025-06-14 01:58:53 +08:00
commit 69e4a32ca2
18 changed files with 870 additions and 538 deletions


@@ -556,6 +556,9 @@ class TextModel(ModelBase):
             logger.info(f"gguf: experts used count = {n_experts_used}")
         if (head_dim := self.hparams.get("head_dim")) is not None:
+            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
+            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
+            if self.hparams.get("model_type") != "deepseek_v3":
                 self.gguf_writer.add_key_length(head_dim)
                 self.gguf_writer.add_value_length(head_dim)
@@ -4798,25 +4801,6 @@ class OlmoeModel(TextModel):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-            yield name, data
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -4832,14 +4816,6 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-        return super().modify_tensors(data_torch, name, bid)
 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):


@@ -518,11 +518,14 @@ void ggml_barrier(struct ggml_threadpool * tp);
 #elif defined(__GNUC__)
 // GCC/Clang on *nix
 # define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(weak name = alias) // NOLINT
-#elif defined(_MSC_VER) && defined (_WIN64)
+#elif defined(_MSC_VER) && defined(_WIN64)
 // MSVC
 // Note: C name mangling varies across different calling conventions
 // see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
 # define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
+#elif defined(_MSC_VER) && defined(WIN32)
+// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
+# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
 #else
 # error "Unsupported compiler for GGML_WEAK_ALIAS"
 #endif
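Note (not part of the diff): the macro above lets a generic implementation stand in for a symbol unless a stronger, arch-specific definition is linked in. A minimal, self-contained sketch of the GCC/Clang path, using made-up symbol names (my_kernel, my_kernel_generic) rather than real ggml symbols:

// Hypothetical illustration of the weak-alias pattern (GCC/Clang only).
#include <stdio.h>

#define DO_PRAGMA_(x) _Pragma(#x)
#define DO_PRAGMA(x)  DO_PRAGMA_(x)
// creates a weak definition of `name` that resolves to `alias` unless a
// strong definition of `name` is provided by another translation unit
#define WEAK_ALIAS(name, alias) DO_PRAGMA(weak name = alias)

void my_kernel_generic(void) {
    printf("generic fallback\n");
}

WEAK_ALIAS(my_kernel, my_kernel_generic)

void my_kernel(void); // the aliased symbol can be called like any function

int main(void) {
    my_kernel(); // prints "generic fallback" unless an optimized my_kernel is linked in
    return 0;
}

The MSVC branches emulate the same behavior with the linker's /alternatename option; the 32-bit branch added here only differs in the leading underscore that 32-bit Windows C name decoration requires.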


@@ -3333,8 +3333,6 @@ kernel void kernel_flash_attn_ext(
     threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 + 0*DK); // holds the query data
     threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
-    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
-    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
     threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal matrix
     threadgroup k_t  * sk  = (threadgroup k_t  *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
@@ -3548,20 +3546,20 @@ kernel void kernel_flash_attn_ext(
         // O = diag(ms)*O
         {
-            s8x8_t mm;
-            simdgroup_load(mm, ss + 2*C, TS, 0, false);
+            s8x8_t ms;
+            simdgroup_load(ms, ss + 2*C, TS, 0, false);
 #pragma unroll(DV8)
             for (short i = 0; i < DV8; ++i) {
-                simdgroup_multiply(lo[i], mm, lo[i]);
+                simdgroup_multiply(lo[i], ms, lo[i]);
             }
         }
         // O = O + (Q*K^T)*V
         {
             for (short cc = 0; cc < C/8; ++cc) {
-                s8x8_t ms;
-                simdgroup_load(ms, ss + 8*cc, TS, 0, false);
+                s8x8_t vs;
+                simdgroup_load(vs, ss + 8*cc, TS, 0, false);
                 if (is_same<vd4x4_t, v4x4_t>::value) {
                     // we can read directly from global memory
@@ -3572,7 +3570,7 @@ kernel void kernel_flash_attn_ext(
                         v8x8_t mv;
                         simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20
-                        simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
+                        simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]);
                     }
                 } else {
                     for (short ii = 0; ii < DV16; ii += 4) {
@@ -3593,10 +3591,10 @@ kernel void kernel_flash_attn_ext(
                             v8x8_t mv;
                             simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
                             simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                         }
                     } else {
                         if (ii + tx < DV16) {
@@ -3611,10 +3609,10 @@ kernel void kernel_flash_attn_ext(
                             v8x8_t mv;
                             simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
                             simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
-                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                            simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                         }
                     }
                 }
@@ -3624,83 +3622,80 @@ kernel void kernel_flash_attn_ext(
         }
         // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
-        for (short j = 0; j < Q; ++j) {
-            if (tiisg == 0) {
-                ss[j*TS + 0] = S[j];
-                ss[j*TS + 1] = M[j];
-            }
-        }
+        for (short j = tiisg; j < Q; j += NW) {
+            ss[j*TS + 0] = S[j];
+            ss[j*TS + 1] = M[j];
+        }
     }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    threadgroup float  * so  = (threadgroup float  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
+    threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK);
+    // store result to shared memory in F32
+    if (sgitg == 0) {
+        for (short i = 0; i < DV8; ++i) {
+            //simdgroup_store(lo[i], so + i*8, DV, 0, false);
+            simdgroup_float8x8 t(1.0f);
+            simdgroup_multiply(t, lo[i], t);
+            simdgroup_store(t, so + i*8, DV, 0, false);
+        }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
     // reduce the warps sequentially
     for (ushort sg = 1; sg < nsg; ++sg) {
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        // each simdgroup stores its output to shared memory, reusing sq
         if (sgitg == sg) {
-            for (short i = 0; i < DV8; ++i) {
-                simdgroup_store(lo[i], so + i*8, DV, 0, false);
-            }
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        // the first simdgroup accumulates the results from the other simdgroups
-        if (sgitg == 0) {
-            for (short j = 0; j < Q; ++j) {
-                const float S0 = ss[j*TS +         0];
-                const float S1 = ss[j*TS + sg*SH + 0];
-                const float M0 = ss[j*TS +         1];
-                const float M1 = ss[j*TS + sg*SH + 1];
+            for (short j = tiisg; j < Q; j += NW) {
+                const float S0 = ss[j*TS - 1*SH + 0];
+                const float S1 = ss[j*TS        + 0];
+                const float M0 = ss[j*TS - 1*SH + 1];
+                const float M1 = ss[j*TS        + 1];
                 const float M = max(M0, M1);
-                const float ms0 = exp(M0 - M);
-                const float ms1 = exp(M1 - M);
+                float ms0 = exp(M0 - M);
+                float ms1 = exp(M1 - M);
                 const float S = S0*ms0 + S1*ms1;
-                if (tiisg == 0) {
-                    ss[j*TS + 0] = S;
-                    ss[j*TS + 1] = M;
-                    ss[j*TS + 2*C + j        ] = ms0;
-                    ss[j*TS + 2*C + j + sg*SH] = ms1;
-                }
+                ss[j*TS + 0] = S;
+                ss[j*TS + 1] = M;
+                ss[j*TS + 2*C + j - 1*SH] = ms0;
+                ss[j*TS + 2*C + j       ] = ms1;
             }
+            //simdgroup_barrier(mem_flags::mem_threadgroup);
             // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
             {
                 s8x8_t ms0;
                 s8x8_t ms1;
-                simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
-                simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
+                simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false);
+                simdgroup_load(ms1, ss + 2*C,        TS, 0, false);
 #pragma unroll(DV8)
                 for (short i = 0; i < DV8; ++i) {
-                    o8x8_t t;
+                    simdgroup_float8x8 t;
                     simdgroup_load    (t, so + i*8, DV, 0, false);
-                    simdgroup_multiply(t, ms1, t);
-                    simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
+                    simdgroup_multiply(t, ms0, t);
+                    simdgroup_multiply_accumulate(t, ms1, lo[i], t);
+                    simdgroup_store(t, so + i*8, DV, 0, false);
                 }
             }
         }
-    }
-    // store result to shared memory (reuse sq)
-    if (sgitg == 0) {
-        for (short i = 0; i < DV8; ++i) {
-            simdgroup_store(lo[i], so + i*8, DV, 0, false);
-        }
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
-    threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*Q*DK);
+    threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);
     // final rescale with 1/S and store to global memory
     for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {
@@ -3723,8 +3718,8 @@ kernel void kernel_flash_attn_ext(
     half,  half4x4,  simdgroup_half8x8,  \
     float,           simdgroup_float8x8, \
     float,           simdgroup_float8x8, \
-    float, float4,   simdgroup_float8x8
-    //half, half4,   simdgroup_half8x8
+    half,  half4,    simdgroup_half8x8
+    //float, float4, simdgroup_float8x8
 #define FA_TYPES_BF \
     bfloat, bfloat4,   simdgroup_bfloat8x8, \
@@ -3732,8 +3727,8 @@ kernel void kernel_flash_attn_ext(
     bfloat, bfloat4x4, simdgroup_bfloat8x8, \
     float,             simdgroup_float8x8, \
     float,             simdgroup_float8x8, \
-    float, float4,     simdgroup_float8x8
-    //half, half4,     simdgroup_half8x8
+    half,  half4,      simdgroup_half8x8
+    //float, float4,   simdgroup_float8x8
 typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;
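Side note (not part of the diff): the per-simdgroup reduction above is the usual flash-attention merge of partial results, where each partial carries a running max M, a running sum S, and an unnormalized accumulator O. A small C sketch of that merge rule, with a made-up head dimension:

// Illustrative only: merging two flash-attention partials (O is unnormalized,
// i.e. it already holds sum(exp(score - M) * V) over the rows seen so far).
#include <math.h>
#include <stdio.h>

#define DV 4 // made-up head dimension

typedef struct {
    float M;     // running max of the attention scores
    float S;     // running sum of exp(score - M)
    float O[DV]; // unnormalized output accumulator
} fa_partial;

static void fa_merge(fa_partial *dst, const fa_partial *a, const fa_partial *b) {
    const float M   = fmaxf(a->M, b->M);
    const float ms0 = expf(a->M - M);
    const float ms1 = expf(b->M - M);
    dst->M = M;
    dst->S = a->S*ms0 + b->S*ms1;
    for (int i = 0; i < DV; ++i) {
        dst->O[i] = a->O[i]*ms0 + b->O[i]*ms1; // O = diag(ms0)*O_0 + diag(ms1)*O_1
    }
    // the true attention output is dst->O[i] / dst->S, applied in the final rescale step
}

int main(void) {
    fa_partial a = { 1.0f, 2.0f, {2.0f, 2.0f, 2.0f, 2.0f} };
    fa_partial b = { 3.0f, 1.0f, {1.5f, 1.5f, 1.5f, 1.5f} };
    fa_partial r;
    fa_merge(&r, &a, &b);
    printf("M = %g, S = %g, O[0]/S = %g\n", r.M, r.S, r.O[0]/r.S);
    return 0;
}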


@@ -0,0 +1,283 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
// This function requires the original shuffled weights.
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
// packed together in a byte, so are (q[1], q[17]) and so on.
inline float block_q_4_0_dot_y_flat(
global uchar * x,
global half * dh,
float sumy,
float16 yl,
int il
) {
float d = *dh;
global ushort * qs = ((global ushort *)x + il/2);
float acc = 0.f;
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
//
// This variant outputs 8 values.
//
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 8 // each SIMD group works on 8 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
inline void mul_vec_q_n_f32_8x_flat(
global char * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = 0;
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl;
float8 sumf = 0.f;
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0.f;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
yb += QK4_0 * (N_SIMDWIDTH/2);
}
float8 tot = (float8)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
global char * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global char * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb02,
int ne10,
int ne11,
int ne12,
ulong nb11,
ulong nb12,
int ne20,
int ne21,
ulong nb21,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float *)((global char *)src1 + offset1);
src2 = (global char *)((global char *)src2 + offset2);
dst = (global float *)((global char *)dst + offsetd);
const int iid1 = get_group_id(2)/ne20;
const int idx = get_group_id(2)%ne20;
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
const int i11 = idx%ne11;
const int i12 = iid1;
const int i1 = idx;
const int i2 = i12;
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
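To make the shuffled q4_0 layout described in the kernel comments concrete, here is a small host-side C sketch (not part of the diff) under the stated packing: byte j of a block holds q[j] in its low nibble and q[j + 16] in its high nibble, and each value decodes as (nibble - 8) * d. The scale is stored as half on the device; the sketch uses float for simplicity.

// Host-side reference sketch: dequantize one q4_0 block with the shuffled packing.
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

typedef struct {
    float   d;             // scale (half on device, float here for simplicity)
    uint8_t qs[QK4_0 / 2]; // 16 bytes, two 4-bit values per byte
} block_q4_0_ref;

static void dequantize_q4_0_ref(const block_q4_0_ref *b, float out[QK4_0]) {
    for (int j = 0; j < QK4_0 / 2; ++j) {
        const int lo = b->qs[j] & 0x0F;        // q[j]
        const int hi = (b->qs[j] >> 4) & 0x0F; // q[j + 16]
        out[j]             = (lo - 8) * b->d;
        out[j + QK4_0 / 2] = (hi - 8) * b->d;
    }
}

int main(void) {
    block_q4_0_ref b = { .d = 0.5f };
    for (int j = 0; j < QK4_0 / 2; ++j) {
        b.qs[j] = (uint8_t) ((j & 0x0F) | ((15 - j) << 4)); // arbitrary test pattern
    }
    float x[QK4_0];
    dequantize_q4_0_ref(&b, x);
    printf("x[0] = %g, x[16] = %g\n", x[0], x[16]);
    return 0;
}

This is also why block_q_4_0_dot_y_flat above can fold the -8 offset into a single "sumy * -8.f" term instead of subtracting it per element.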

File diff suppressed because it is too large.


@@ -333,7 +333,9 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",            # nomic-bert
             "encoder.layers.{bid}.mlp.fc1",             # nomic-bert-moe
             "model.layers.{bid}.mlp.c_fc",              # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v",   # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_v",   # jina-bert-v2 (split up/gate, no longer used)
+            "encoder.layer.{bid}.mlp.gated_layers",     # jina-bert-v2 (GEGLU)
+            "encoder.layer.{bid}.mlp.up_gated_layer",   # jina-v2-code (GEGLU)
             "model.layers.{bid}.residual_mlp.w3",       # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",   # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",           # exaone
@@ -370,7 +372,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
             "model.layers.{bid}.feed_forward.w1",       # internlm2
             "encoder.layers.{bid}.mlp.fc12",            # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w",   # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_w",   # jina-bert-v2 (split up/gate, no longer used)
             "transformer.h.{bid}.mlp.linear_1",         # refact
             "model.layers.{bid}.residual_mlp.w1",       # arctic
             "transformer.h.{bid}.mlp.c_fc_0",           # exaone


@@ -250,22 +250,6 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
         }
     }
-void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
-    GGML_UNUSED(ubatch);
-    const int64_t n_kv = kv_state->get_n_kv();
-    if (s_mask) {
-        GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
-        float * data = (float *) s_mask->data;
-        // clear unused states
-        for (int i = 0; i < n_kv; ++i) {
-            data[i] = kv_state->s_mask(i);
-        }
-    }
-}
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);
@@ -650,6 +634,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
                 int64_t split_point = cur->ne[0] / 2;
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
@@ -663,7 +648,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Split into two equal parts
                 int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
@@ -986,23 +971,6 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
     return cur;
 }
-ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
-    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);
-    const auto n_kv = kv_state->get_n_kv();
-    auto & cur = inp->s_mask;
-    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
-    ggml_set_input(cur);
-    res->add_input(std::move(inp));
-    return cur;
-}
 ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
@@ -1455,43 +1423,53 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
-ggml_tensor * llm_graph_context::build_copy_mask_state(
+ggml_tensor * llm_graph_context::build_recurrent_state(
         ggml_cgraph * gf,
         ggml_tensor * s,
         ggml_tensor * state_copy,
-        ggml_tensor * state_mask,
-            int32_t   n_state,
-            int32_t   n_seqs) const {
+            int32_t   state_size,
+            int32_t   n_seqs,
+               bool   avoid_copies) const {
     const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
     const auto n_kv = kv_state->get_n_kv();
     const auto kv_head = kv_state->get_head();
+    const auto rs_zero = kv_state->get_rs_z();
-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size());
-    // copy states
-    // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
-    // this shrinks the tensors's ne[1] to n_kv
-    states = ggml_get_rows(ctx0, states, state_copy);
-    // clear states of sequences which are starting at the beginning of this batch
-    // FIXME: zero-out NANs?
-    states = ggml_mul(ctx0, states, state_mask);
-    // copy states which won't be changed further (between n_seqs and n_kv)
-    ggml_build_forward_expand(gf,
-        ggml_cpy(ctx0,
-            ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs          )*n_state*ggml_element_size(states)),
-            ggml_view_1d(ctx0, s,      n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
-    // the part of the states that will be used and modified
-    return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_state->get_size());
+    // Clear a single state which will then be copied to the other cleared states.
+    // Note that this is a no-op when the view is zero-sized.
+    ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
+    ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
+    ggml_tensor * output_states;
+    if (!avoid_copies) {
+        // copy states
+        // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
+        // {state_size, kv_size} -> {state_size, n_seqs}
+        output_states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
+        ggml_build_forward_expand(gf, output_states);
+    } else {
+        // FIXME: make the gathering operation happen before the copy below
+        //        (maybe with an optional lambda function passed as a parameter instead of `avoid_copies`?)
+        output_states = states;
+    }
+    // copy extra states which won't be changed further (between n_seqs and n_kv)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
+    ggml_build_forward_expand(gf,
+        ggml_cpy(ctx0,
+            states_extra,
+            ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
+    return output_states;
 }
 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
          ggml_cgraph * gf,
          ggml_tensor * state_copy,
-         ggml_tensor * state_mask,
   const llama_ubatch & ubatch,
                  int   il) const {
     const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -1502,8 +1480,8 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
     ggml_tensor * token_shift_all = kv_state->get_k_l(il);
-    ggml_tensor * token_shift = build_copy_mask_state(
-            gf, token_shift_all, state_copy, state_mask,
+    ggml_tensor * token_shift = build_recurrent_state(
+            gf, token_shift_all, state_copy,
             hparams.n_embd_k_s(), n_seqs);
     token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
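As a rough mental model (not part of the diff): build_recurrent_state now gathers one state row per destination slot through the copy-index tensor, after zeroing the single slot selected as the blank state (rs_z), so freshly started sequences read zeros instead of being masked out. A plain C sketch of that semantics, with made-up slot and state sizes:

// Conceptual sketch only: what the copy-index gather amounts to on the CPU.
// copy[i] names the source slot whose state should end up in slot i;
// rs_z is the slot cleared so that new sequences start from zeros.
#include <stdio.h>
#include <string.h>

#define N_SLOTS    4 // made-up number of recurrent slots
#define STATE_SIZE 3 // made-up per-slot state size

static void gather_states(float states[N_SLOTS][STATE_SIZE],
                          const int copy[N_SLOTS], int rs_z,
                          float out[N_SLOTS][STATE_SIZE]) {
    if (rs_z >= 0) {
        memset(states[rs_z], 0, sizeof(states[rs_z])); // blank state for fresh sequences
    }
    for (int i = 0; i < N_SLOTS; ++i) {
        memcpy(out[i], states[copy[i]], sizeof(out[i])); // out[i] = states[copy[i]]
    }
}

int main(void) {
    float states[N_SLOTS][STATE_SIZE] = {{1,1,1},{2,2,2},{3,3,3},{4,4,4}};
    float out[N_SLOTS][STATE_SIZE];
    const int copy[N_SLOTS] = {0, 0, 2, 3}; // slot 1 continues from slot 0's state
    gather_states(states, copy, /*rs_z=*/3, out);
    printf("out[1][0] = %g, out[3][0] = %g\n", out[1][0], out[3][0]); // 1 and 0
    return 0;
}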


@@ -200,18 +200,6 @@ public:
     const llama_kv_cache_recurrent_state * kv_state;
 };
-class llm_graph_input_s_mask : public llm_graph_input_i {
-public:
-    llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
-    virtual ~llm_graph_input_s_mask() = default;
-    void set_input(const llama_ubatch * ubatch) override;
-    ggml_tensor * s_mask; // F32 [1, n_kv]
-    const llama_kv_cache_recurrent_state * kv_state;
-};
 class llm_graph_input_cross_embd : public llm_graph_input_i {
 public:
     llm_graph_input_cross_embd(
@@ -521,7 +509,6 @@ struct llm_graph_context {
     ggml_tensor * build_inp_mean() const;
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
-    ggml_tensor * build_inp_s_mask() const;
     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
@@ -606,18 +593,17 @@ struct llm_graph_context {
     // recurrent
     //
-    ggml_tensor * build_copy_mask_state(
+    ggml_tensor * build_recurrent_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
-                int32_t   n_state,
-                int32_t   n_seqs) const;
+                int32_t   state_size,
+                int32_t   n_seqs,
+                   bool   avoid_copies = false) const;
     ggml_tensor * build_rwkv_token_shift_load(
             ggml_cgraph * gf,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const;


@@ -406,21 +406,12 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
     bool success = true;
-    // TODO: here we have to verify that all ubatches can fit in the cells
-    //       however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells
-    //       during the compute of each ubatch. to reproduce, uncomment the following loop and run:
-    //
-    //         $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8
-    //
-    //       recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed
-    //
-    GGML_UNUSED(ubatches);
-    //for (const auto & ubatch : ubatches) {
-    //    if (!find_slot(ubatch)) {
-    //        success = false;
-    //        break;
-    //    }
-    //}
+    for (const auto & ubatch : ubatches) {
+        if (!find_slot(ubatch)) {
+            success = false;
+            break;
+        }
+    }
     // restore the original state
     cells = std::move(org_cells);
@@ -431,14 +422,13 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
 }
 bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
-    const uint32_t n_tokens = ubatch.n_tokens;
     const uint32_t n_seqs = ubatch.n_seqs;
     const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
     // if we have enough unused cells before the current head ->
     //   better to start searching from the beginning of the cache, hoping to fill it
-    if (head > used + 2*n_tokens) {
+    if (head > used + 2*n_seqs) {
         head = 0;
     }
@@ -534,16 +524,16 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
                     empty_cell.src = orig_cell.src;
                     orig_cell.seq_id.erase(seq_id);
                     empty_cell.seq_id.insert(seq_id); // will be overwritten
+                    GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
                 }
                 seq_meta.tail = next_empty_cell;
                 // find next empty cell
                 if (s + 1 < n_seqs) {
-                    next_empty_cell += 1;
                     for (uint32_t i = 0; i < size; ++i) {
+                        next_empty_cell += 1;
                         if (next_empty_cell >= size) { next_empty_cell -= size; }
                         kv_cell & cell = cells[next_empty_cell];
                         if (cell.is_empty()) { break; }
-                        next_empty_cell += 1;
                     }
                 }
             }
@@ -553,8 +543,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     // gather and re-order
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        int32_t dst_id = s + min;
-        int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
+        const int32_t dst_id = s + min;
+        const int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
         if (dst_id != src_id) {
             kv_cell & dst_cell = cells[dst_id];
             kv_cell & src_cell = cells[src_id];
@@ -563,12 +553,14 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
             std::swap(dst_cell.src, src_cell.src);
             std::swap(dst_cell.seq_id, src_cell.seq_id);
-            // swap tails (assuming they NEVER overlap)
-            for (const llama_seq_id seq_id : src_cell.seq_id) {
-                cells[seq_id].tail = src_id;
-            }
-            for (const llama_seq_id seq_id : dst_cell.seq_id) {
-                cells[seq_id].tail = dst_id;
+            // swap tails
+            for (uint32_t i = 0; i < size; ++i) {
+                int32_t & tail = cells[i].tail;
+                if (tail == src_id) {
+                    tail = dst_id;
+                } else if (tail == dst_id) {
+                    tail = src_id;
+                }
             }
         }
     }
@@ -576,7 +568,7 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     // update the pos of the used seqs
     for (uint32_t s = 0; s < n_seqs; ++s) {
         const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
-        int32_t cell_id = s + min;
+        const int32_t cell_id = s + min;
         kv_cell & cell = cells[cell_id];
         if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
@@ -594,6 +586,38 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
         }
     }
+    // Find first cell without src refs, to use as the zero-ed state
+    {
+        // TODO: bake-in src refcounts in the cell metadata
+        std::vector<int32_t> refcounts(size, 0);
+        for (size_t i = 0; i < size; ++i) {
+            const int32_t src = cells[i].src;
+            if (src >= 0) {
+                refcounts[src] += 1;
+            }
+        }
+        rs_z = -1;
+        for (int i = min; i <= max; ++i) {
+            if (refcounts[i] == 0) {
+                rs_z = i;
+                break;
+            }
+        }
+        for (int i = min; i <= max; ++i) {
+            if (cells[i].src < 0) {
+                GGML_ASSERT(rs_z >= 0);
+                cells[i].src0 = rs_z;
+            } else {
+                // Stage the source ids for all used cells to allow correct seq_* behavior
+                // and still make these values available when setting the inputs
+                cells[i].src0 = cells[i].src;
+            }
+            cells[i].src = i; // avoid moving or clearing twice
+        }
+    }
     // allow getting the range of used cells, from head to head + n
     head = min;
     n    = max - min + 1;
@@ -605,47 +629,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
 }
 bool llama_kv_cache_recurrent::get_can_shift() const {
-    return false;
-}
-int32_t llama_kv_cache_recurrent::s_copy(int i) const {
-    const uint32_t cell_id = i + head;
-    //////////////////////////////////////////////
-    // TODO: this should not mutate the KV cache !
-    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
-    // prevent out-of-bound sources
-    if (cell.src < 0 || (uint32_t) cell.src >= size) {
-        cell.src = cell_id;
-    }
-    int32_t res = cell.src;
-    // TODO: do not mutate the KV cache
-    // ensure copy only happens once
-    if (cell.src != (int32_t) cell_id) {
-        cell.src = cell_id;
-    }
-    return res;
-}
-float llama_kv_cache_recurrent::s_mask(int i) const {
-    const uint32_t cell_id = i + head;
-    //////////////////////////////////////////////
-    // TODO: this should not mutate the KV cache !
-    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
-    float res = (float) (cell.src >= 0);
-    // only clear once
-    if (cell.src < 0) {
-        cell.src = cell_id;
-    }
-    return res;
+    // shifting the pos is trivial for recurrent models
+    return true;
 }
 size_t llama_kv_cache_recurrent::total_size() const {
@@ -1111,6 +1096,10 @@ uint32_t llama_kv_cache_recurrent_state::get_head() const {
     return is_full ? 0 : kv->head;
 }
+int32_t llama_kv_cache_recurrent_state::get_rs_z() const {
+    return is_full ? 0 : kv->rs_z;
+}
 uint32_t llama_kv_cache_recurrent_state::get_size() const {
     return kv->size;
 }
@@ -1124,9 +1113,5 @@ ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const {
 }
 int32_t llama_kv_cache_recurrent_state::s_copy(int i) const {
-    return kv->s_copy(i);
-}
-float llama_kv_cache_recurrent_state::s_mask(int i) const {
-    return kv->s_mask(i);
+    return kv->cells[i + kv->head].src0;
 }


@@ -57,10 +57,6 @@ public:
     bool get_can_shift() const override;
-    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
-    int32_t s_copy(int i) const;
-    float   s_mask(int i) const;
     // state write/load
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
@@ -73,10 +69,14 @@ public:
     // computed before each graph build
     uint32_t n = 0;
+    // first zero-ed state
+    int32_t rs_z = -1;
     // TODO: optimize for recurrent state needs
     struct kv_cell {
         llama_pos pos = -1;
-        int32_t   src = -1; // used to copy states
+        int32_t   src  = -1; // used to know where states should be copied from
+        int32_t   src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
         int32_t   tail = -1;
         std::set<llama_seq_id> seq_id;
@@ -157,13 +157,13 @@ public:
     uint32_t get_n_kv() const;
     uint32_t get_head() const;
+    int32_t  get_rs_z() const;
     uint32_t get_size() const;
     ggml_tensor * get_k_l(int32_t il) const;
     ggml_tensor * get_v_l(int32_t il) const;
     int32_t s_copy(int i) const;
-    float   s_mask(int i) const;
 private:
     const llama_memory_status status;


@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
     }
+    const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
+    debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 }
 void llama_kv_cache_unified::clear(bool data) {
@@ -462,7 +465,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
         for (uint32_t i = 0; i < n_kv; ++i) {
             assert(dinfo.ids[i] <= n_kv);
-            if (dinfo.ids[i] == n_kv) {
+            if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
                 continue;
             }
@@ -512,21 +515,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
         head_cur = 0;
     }
-    // otherwise, one cell per token.
     if (n_tokens > cells.size()) {
         LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
         return -1;
     }
-//#define FIND_SLOT_DEBUG 1
-#if FIND_SLOT_DEBUG
-    LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
-    // for debugging
-    {
-        if (n_swa > 0) {
+    if (debug > 0) {
+        LLAMA_LOG_CONT("\n");
+        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
             std::string ss;
             for (uint32_t i = 0; i < cells.size(); ++i) {
                 if (cells.is_empty(i)) {
                     ss += '.';
@@ -534,11 +533,35 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                     ss += std::to_string(cells.seq_get(i));
                 }
                 if (i%256 == 255) {
+                    ss += " *";
                     ss += '\n';
                 }
             }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
        }
-        LLAMA_LOG_WARN("\n%s\n", ss.c_str());
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
+            std::string ss;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                std::string cur;
+                if (cells.is_empty(i)) {
+                    cur = '.';
+                } else {
+                    cur = std::to_string(cells.pos_get(i));
+                }
+                const int n = cur.size();
+                for (int j = 0; j < 5 - n; ++j) {
+                    cur += ' ';
+                }
+                ss += cur;
+                if (i%256 == 255) {
+                    ss += " *";
+                }
+                if (i%64 == 63) {
+                    ss += '\n';
+                }
+            }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
        }
        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
@@ -546,9 +569,9 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                continue;
            }
-            LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+        }
     }
-#endif
     uint32_t n_tested = 0;
@@ -559,21 +582,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             continue;
         }
-        // keep track of what the minimum sequence positions would be if we accept the ubatch
-        llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
-            seq_pos_min[s] = cells.seq_pos_min(s);
-        }
         bool found = true;
         for (uint32_t i = 0; i < n_tokens; i++) {
-            const llama_pos    pos    = ubatch.pos[i];
-            const llama_seq_id seq_id = ubatch.seq_id[i][0];
+            //const llama_pos    pos    = ubatch.pos[i];
+            //const llama_seq_id seq_id = ubatch.seq_id[i][0];
             // can we use this cell? either:
             // - the cell is empty
             // - the cell is occupied only by one sequence:
-            //   - mask causally, if the sequence is the same as the one we are inserting
+            //   - (disabled) mask causally, if the sequence is the same as the one we are inserting
             //   - mask SWA, using current max pos for that sequence in the cache
             //   always insert in the cell with minimum pos
             bool can_use = cells.is_empty(head_cur + i);
@@ -581,21 +598,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             if (!can_use && cells.seq_count(head_cur + i) == 1) {
                 const llama_pos pos_cell = cells.pos_get(head_cur + i);
-                // causal mask
-                if (cells.seq_has(head_cur + i, seq_id)) {
-                    can_use = pos_cell >= pos;
-                }
+                // (disabled) causal mask
+                // note: it's better to purge any "future" tokens beforehand
+                //if (cells.seq_has(head_cur + i, seq_id)) {
+                //    can_use = pos_cell >= pos;
+                //}
                 if (!can_use) {
                     const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
                     // SWA mask
-                    // note: we insert only in the cell with minimum pos in order to preserve the invariant that
-                    //       all positions between [pos_min, pos_max] for each sequence will be present in the cache
-                    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
-                    if (pos_cell == seq_pos_min[seq_id_cell] &&
-                        is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
-                        seq_pos_min[seq_id_cell]++;
+                    if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
                         can_use = true;
                     }
                 }
@@ -623,8 +636,22 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 }
 void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
+    // keep track of the max sequence position that we would overwrite with this ubatch
+    // for non-SWA cache, this would be always empty
+    llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
+    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        seq_pos_max_rm[s] = -1;
+    }
     for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
         if (!cells.is_empty(head_cur + i)) {
+            assert(cells.seq_count(head_cur + i) == 1);
+            const llama_seq_id seq_id = cells.seq_get(head_cur + i);
+            const llama_pos    pos    = cells.pos_get(head_cur + i);
+            seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
             cells.rm(head_cur + i);
         }
@@ -635,6 +662,22 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
         }
     }
+    // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
+    //       will be present in the cache. so we have to purge any position which is less than those we would overwrite
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
+    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        if (seq_pos_max_rm[s] == -1) {
+            continue;
+        }
+        if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
+            LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
+                    __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
+            seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
+        }
+    }
     // move the head at the end of the slot
     head = head_cur + ubatch.n_tokens;
 }
@@ -944,11 +987,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
     const auto & n_embd_head_k = hparams.n_embd_head_k;
     //const auto & n_embd_head_v = hparams.n_embd_head_v;
-    //GGML_ASSERT(kv_self->size == n_ctx);
     auto inp = std::make_unique<llm_graph_input_k_shift>(this);
-    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
+    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
     ggml_set_input(inp->k_shift);
     for (const auto & layer : layers) {


@@ -158,6 +158,8 @@ private:
     // SWA
     const uint32_t n_swa = 0;
+    int debug = 0;
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     std::vector<ggml_context_ptr> ctxs;


@@ -80,6 +80,9 @@ public:
         assert(isrc < pos.size());
         assert(idst < pos.size());
+        assert(pos[idst] == -1);
+        assert(pos[isrc] != -1);
         pos  [idst] = pos  [isrc];
         shift[idst] = shift[isrc];
         seq  [idst] = seq  [isrc];
@@ -144,9 +147,10 @@ public:
         assert(pos[i] != -1);
         seq_pos_rm(i);
+        seq[i].reset();
         pos[i] = -1;
-        seq[i].reset();
+        shift[i] = 0;
         used.erase(i);
     }
@@ -164,6 +168,7 @@ public:
         if (seq[i].none()) {
             pos[i] = -1;
+            shift[i] = 0;
             used.erase(i);
@@ -192,6 +197,7 @@ public:
             seq[i].reset();
             pos[i] = -1;
+            shift[i] = 0;
             used.erase(i);
@@ -317,21 +323,20 @@ public:
         pos[i]   += d;
         shift[i] += d;
-        seq_pos_add(i);
         has_shift = true;
         if (pos[i] < 0) {
-            seq_pos_rm(i);
             seq[i].reset();
             pos[i] = -1;
+            shift[i] = 0;
             used.erase(i);
             return true;
         }
+        seq_pos_add(i);
         return false;
     }


@@ -2320,8 +2320,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
                     layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
@@ -6143,7 +6143,7 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_gate, NULL,                        NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                        model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
             } else {
                 cur = build_ffn(cur,
@@ -8957,7 +8957,6 @@ struct llm_build_mamba : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8966,8 +8965,7 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
-            //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
-            cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
+            cur = build_mamba_layer(gf, cur, state_copy, ubatch, il);
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -9008,7 +9006,6 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -9035,12 +9032,12 @@ struct llm_build_mamba : public llm_graph_context {
         ggml_tensor * ssm_states_all  = kv_state->get_v_l(il);
         // (ab)using the KV cache to store the states
-        ggml_tensor * conv = build_copy_mask_state(
-                gf, conv_states_all, state_copy, state_mask,
+        ggml_tensor * conv = build_recurrent_state(
+                gf, conv_states_all, state_copy,
                 hparams.n_embd_k_s(), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-        ggml_tensor * ssm = build_copy_mask_state(
-                gf, ssm_states_all, state_copy, state_mask,
+        ggml_tensor * ssm = build_recurrent_state(
+                gf, ssm_states_all, state_copy,
                 hparams.n_embd_v_s(), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
@@ -11756,7 +11753,6 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -11880,8 +11876,8 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
         }
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                 hparams.n_embd_v_s(), n_seqs);
         ggml_tensor * wkv_output;
@@ -11937,7 +11933,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11948,7 +11943,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
             inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
             ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
+                    gf, state_copy, ubatch, il
                     );
             ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -11964,7 +11959,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
                     1
                     );
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -12035,7 +12030,6 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         inpL = build_inp_embd(model.tok_embd);
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12046,7 +12040,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
            inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
            ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                    gf, state_copy, state_mask, ubatch, il
+                    gf, state_copy, ubatch, il
                    );
            ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -12059,7 +12053,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
                    1
                    );
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
            token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
            ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12151,7 +12145,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
            ggml_tensor * cur,
            ggml_tensor * x_prev,
            ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
            ggml_tensor *& first_layer_value,
      const llama_ubatch & ubatch,
                    int   il) const {
@@ -12234,8 +12227,8 @@ struct llm_build_rwkv7_base : public llm_graph_context {
        v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
        a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                hparams.n_embd_v_s(), n_seqs);
        ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -12293,7 +12286,6 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
        inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
        ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
        const auto n_embd = hparams.n_embd;
        const auto n_seq_tokens = ubatch.n_seq_tokens;
@ -12304,7 +12296,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
ggml_tensor * token_shift = build_rwkv_token_shift_load( ggml_tensor * token_shift = build_rwkv_token_shift_load(
gf, state_copy, state_mask, ubatch, il gf, state_copy, ubatch, il
); );
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@ -12320,7 +12312,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
1 1
); );
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il); cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
@ -12387,7 +12379,6 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
inpL = build_inp_embd(model.tok_embd); inpL = build_inp_embd(model.tok_embd);
ggml_tensor * state_copy = build_inp_s_copy(); ggml_tensor * state_copy = build_inp_s_copy();
ggml_tensor * state_mask = build_inp_s_mask();
const auto n_embd = hparams.n_embd; const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens; const auto n_seq_tokens = ubatch.n_seq_tokens;
@ -12398,7 +12389,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
ggml_tensor * token_shift = build_rwkv_token_shift_load( ggml_tensor * token_shift = build_rwkv_token_shift_load(
gf, state_copy, state_mask, ubatch, il gf, state_copy, ubatch, il
); );
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@ -12411,7 +12402,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
1 1
); );
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il); cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
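
Taken together, the hunks above are a single mechanical API change: the separate state_mask graph input (built with build_inp_s_mask()) is removed, build_copy_mask_state() is renamed to build_recurrent_state(), and the rwkv6/rwkv7 time-mix and token-shift helpers drop their state_mask parameter. A condensed before/after view of one call site, restating the hunks above rather than standalone compilable code:

// before: callers passed both the copy indices and a mask
ggml_tensor * conv = build_copy_mask_state(
        gf, conv_states_all, state_copy, state_mask,
        hparams.n_embd_k_s(), n_seqs);

// after: only state_copy is passed; the same pattern repeats in the
// mamba, rwkv6, rwkv6qwen2, rwkv7 and arwkv7 builders
ggml_tensor * conv = build_recurrent_state(
        gf, conv_states_all, state_copy,
        hparams.n_embd_k_s(), n_seqs);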

tests/test-tokenizers-repo.sh Executable file (36 additions)
View file

@@ -0,0 +1,36 @@
#!/bin/bash
if [ $# -lt 2 ]; then
printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
exit 1
fi
if [ $# -eq 3 ]; then
toktest=$3
else
toktest="./test-tokenizer-0"
fi
if [ ! -x $toktest ]; then
printf "Test executable \"$toktest\" not found!\n"
exit 1
fi
repo=$1
folder=$2
if [ -d $folder ] && [ -d $folder/.git ]; then
(cd $folder; git pull)
else
git clone $repo $folder
fi
shopt -s globstar
for gguf in $folder/**/*.gguf; do
if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
$toktest $gguf
else
printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
fi
done
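
Usage note: given the argument handling above, a typical invocation would look like tests/test-tokenizers-repo.sh <vocab-repo-url> <target-folder> [<path-to-test-tokenizer-0>] (placeholders, not values from the patch). The script clones the repo, or pulls if <target-folder> already holds a checkout, then runs the tokenizer test on every *.gguf that has matching .inp and .out files and skips the rest.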

Binary file not shown.

View file

@@ -233,6 +233,7 @@ struct server_task {
 slot_params defaults;
 defaults.sampling = params_base.sampling;
 defaults.speculative = params_base.speculative;
+defaults.n_keep = params_base.n_keep;
 // enabling this will output extra debug information in the HTTP responses from the server
 params.verbose = params_base.verbosity > 9;
@@ -2060,6 +2061,7 @@ struct server_context {
 SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 slot.params.sampling = params_base.sampling;
+slot.params.n_keep = params_base.n_keep;
 slot.callback_on_release = [this](int) {
 queue_tasks.pop_deferred_task();
@@ -3556,9 +3558,6 @@ struct server_context {
 const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
-// keep track of total number of tokens generated in the draft
-slot.n_draft_total += draft.size();
 // ignore small drafts
 if (slot.params.speculative.n_min > (int) draft.size()) {
 SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3566,6 +3565,9 @@ struct server_context {
 continue;
 }
+// keep track of total number of drafted tokens tested
+slot.n_draft_total += draft.size();
 // construct the speculation batch
 common_batch_clear(slot.batch_spec);
 common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
@@ -3584,7 +3586,7 @@ struct server_context {
 slot.n_past += ids.size();
 slot.n_decoded += ids.size();
-// update how many tokens out of draft was accepted
+// update how many tokens out of those tested were accepted
 slot.n_draft_accepted += ids.size() - 1;
 slot.cache_tokens.push_back(id);
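
The last three hunks move the slot.n_draft_total update below the small-draft check, so drafts that are ignored for being shorter than n_min no longer inflate the denominator of the acceptance statistics. A standalone sketch of the resulting bookkeeping, using hypothetical names rather than the server's structs, and assuming the acceptance rate is reported as n_draft_accepted over n_draft_total:

#include <cstdio>
#include <vector>

struct draft_stats {
    int n_draft_total    = 0; // drafted tokens that were actually tested
    int n_draft_accepted = 0; // tested tokens the target model accepted
};

// Count a draft only if it is large enough to be tested, mirroring the
// reordered check in the patch: small drafts are skipped entirely.
static void account_draft(draft_stats & stats, const std::vector<int> & draft,
                          int n_min, int n_accepted) {
    if (n_min > (int) draft.size()) {
        return; // ignored draft: neither counter changes
    }
    stats.n_draft_total    += (int) draft.size();
    stats.n_draft_accepted += n_accepted;
}

int main() {
    draft_stats stats;
    account_draft(stats, {11, 12, 13, 14}, 2, 3); // tested, 3 of 4 accepted
    account_draft(stats, {15},             2, 0); // too small, ignored
    if (stats.n_draft_total > 0) {
        std::printf("accepted %d of %d drafted tokens (%.0f%%)\n",
                    stats.n_draft_accepted, stats.n_draft_total,
                    100.0 * stats.n_draft_accepted / stats.n_draft_total);
    }
    return 0;
}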

View file

@@ -41,6 +41,10 @@ html {
 max-width: 900px;
 }
+.chat-bubble {
+@apply break-words;
+}
 .chat-bubble-base-300 {
 --tw-bg-opacity: 1;
 --tw-text-opacity: 1;