Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-12 01:54:37 +00:00)

Merge commit 'd4e0d95cf5' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	common/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt

Commit 69e4a32ca2: 18 changed files with 870 additions and 538 deletions.
@@ -556,6 +556,9 @@ class TextModel(ModelBase):
             logger.info(f"gguf: experts used count = {n_experts_used}")

         if (head_dim := self.hparams.get("head_dim")) is not None:
-            self.gguf_writer.add_key_length(head_dim)
-            self.gguf_writer.add_value_length(head_dim)
+            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
+            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
+            if self.hparams.get("model_type") != "deepseek_v3":
+                self.gguf_writer.add_key_length(head_dim)
+                self.gguf_writer.add_value_length(head_dim)

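A minimal sketch of the guard this hunk introduces, using a plain dict in place of the converter's `self.hparams` (the printed messages are illustrative, not the converter's real log output):

```python
# Sketch of the added guard: head_dim from AutoConfig is wrong for DeepSeekV3,
# so the key/value lengths are only written for other model types.
hparams = {"model_type": "deepseek_v3", "head_dim": 64}  # toy values

if (head_dim := hparams.get("head_dim")) is not None:
    if hparams.get("model_type") != "deepseek_v3":
        print(f"gguf: key/value length = {head_dim}")
    else:
        print("gguf: skipping head_dim (incorrect AutoConfig value for DeepSeekV3)")
```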
@@ -4798,25 +4801,6 @@ class OlmoeModel(TextModel):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-
-            yield name, data
-
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:

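The deleted `get_tensors` override split the fused GEGLU weight into separate gate/up halves at conversion time; with this change the fused names are mapped directly instead (see the gguf-py tensor-mapping hunk further down). A minimal numpy sketch of what the removed split did, with toy shapes:

```python
import numpy as np

# The fused "gated_layers" matrix stacks two [intermediate_size, hidden]
# projections along dim 0; the removed override sliced it into halves.
intermediate_size, hidden = 4, 3
fused = np.arange(2 * intermediate_size * hidden, dtype=np.float32).reshape(2 * intermediate_size, hidden)

d1 = fused[:intermediate_size, :]  # -> gated_layers_w (or _v for up_gated_layer)
d2 = fused[intermediate_size:, :]  # -> gated_layers_v (or _w for up_gated_layer)

assert d1.shape == d2.shape == (intermediate_size, hidden)
```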
@@ -4832,14 +4816,6 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-

 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):

@@ -523,6 +523,9 @@ void ggml_barrier(struct ggml_threadpool * tp);
 // Note: C name mangling varies across different calling conventions
 // see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
 #    define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
+#elif defined(_MSC_VER) && defined(WIN32)
+// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
+#    define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
 #else
 #    error "Unsupported compiler for GGML_WEAK_ALIAS"
 #endif

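The new `#elif` branch is needed because 32-bit MSVC decorates `__cdecl` C symbols with a leading underscore, so `/alternatename` must reference `_name` rather than `name`. A simplified sketch of that rule (`ggml_foo`/`ggml_bar` are made-up symbol names; real decoration also varies with the calling convention):

```python
# Simplified model of why the WIN32 branch prepends "_" in /alternatename:
# 32-bit __cdecl C symbols are decorated with a leading underscore, x64 ones are not.
def alternatename(name: str, alias: str, win32: bool) -> str:
    p = "_" if win32 else ""
    return f"/alternatename:{p}{name}={p}{alias}"

assert alternatename("ggml_foo", "ggml_bar", win32=True)  == "/alternatename:_ggml_foo=_ggml_bar"
assert alternatename("ggml_foo", "ggml_bar", win32=False) == "/alternatename:ggml_foo=ggml_bar"
```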
@@ -3333,8 +3333,6 @@ kernel void kernel_flash_attn_ext(

     threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 + 0*DK); // holds the query data
     threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
-    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
-    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
     threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal matrix

     threadgroup k_t  * sk  = (threadgroup k_t  *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory

@@ -3548,20 +3546,20 @@ kernel void kernel_flash_attn_ext(

                 // O = diag(ms)*O
                 {
-                    s8x8_t mm;
-                    simdgroup_load(mm, ss + 2*C, TS, 0, false);
+                    s8x8_t ms;
+                    simdgroup_load(ms, ss + 2*C, TS, 0, false);

 #pragma unroll(DV8)
                     for (short i = 0; i < DV8; ++i) {
-                        simdgroup_multiply(lo[i], mm, lo[i]);
+                        simdgroup_multiply(lo[i], ms, lo[i]);
                     }
                 }

                 // O = O + (Q*K^T)*V
                 {
                     for (short cc = 0; cc < C/8; ++cc) {
-                        s8x8_t ms;
-                        simdgroup_load(ms, ss + 8*cc, TS, 0, false);
+                        s8x8_t vs;
+                        simdgroup_load(vs, ss + 8*cc, TS, 0, false);

                         if (is_same<vd4x4_t, v4x4_t>::value) {
                             // we can read directly from global memory

@@ -3572,7 +3570,7 @@ kernel void kernel_flash_attn_ext(
                                 v8x8_t mv;
                                 simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20

-                                simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
+                                simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]);
                             }
                         } else {
                             for (short ii = 0; ii < DV16; ii += 4) {

@@ -3593,10 +3591,10 @@ kernel void kernel_flash_attn_ext(
                                         v8x8_t mv;

                                         simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
-                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);

                                         simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
-                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                                     }
                                 } else {
                                     if (ii + tx < DV16) {

@@ -3611,10 +3609,10 @@ kernel void kernel_flash_attn_ext(
                                         v8x8_t mv;

                                         simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
-                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
+                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);

                                         simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
-                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
+                                        simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                                     }
                                 }
                             }

@@ -3624,83 +3622,80 @@ kernel void kernel_flash_attn_ext(
         }

         // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
-        for (short j = 0; j < Q; ++j) {
-            if (tiisg == 0) {
-                ss[j*TS + 0] = S[j];
-                ss[j*TS + 1] = M[j];
-            }
+        for (short j = tiisg; j < Q; j += NW) {
+            ss[j*TS + 0] = S[j];
+            ss[j*TS + 1] = M[j];
         }

+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        threadgroup float  * so  = (threadgroup float  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
+        threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK);
+
+        // store result to shared memory in F32
+        if (sgitg == 0) {
+            for (short i = 0; i < DV8; ++i) {
+                //simdgroup_store(lo[i], so + i*8, DV, 0, false);
+                simdgroup_float8x8 t(1.0f);
+                simdgroup_multiply(t, lo[i], t);
+                simdgroup_store(t, so + i*8, DV, 0, false);
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
         // reduce the warps sequentially
         for (ushort sg = 1; sg < nsg; ++sg) {
-            threadgroup_barrier(mem_flags::mem_threadgroup);
-
-            // each simdgroup stores its output to shared memory, reusing sq
             if (sgitg == sg) {
-                for (short i = 0; i < DV8; ++i) {
-                    simdgroup_store(lo[i], so + i*8, DV, 0, false);
-                }
-            }
-
-            threadgroup_barrier(mem_flags::mem_threadgroup);
-
-            // the first simdgroup accumulates the results from the other simdgroups
-            if (sgitg == 0) {
-                for (short j = 0; j < Q; ++j) {
-                    const float S0 = ss[j*TS +         0];
-                    const float S1 = ss[j*TS + sg*SH + 0];
-
-                    const float M0 = ss[j*TS +         1];
-                    const float M1 = ss[j*TS + sg*SH + 1];
+                for (short j = tiisg; j < Q; j += NW) {
+                    const float S0 = ss[j*TS - 1*SH + 0];
+                    const float S1 = ss[j*TS        + 0];
+
+                    const float M0 = ss[j*TS - 1*SH + 1];
+                    const float M1 = ss[j*TS        + 1];

                     const float M = max(M0, M1);

-                    const float ms0 = exp(M0 - M);
-                    const float ms1 = exp(M1 - M);
+                    float ms0 = exp(M0 - M);
+                    float ms1 = exp(M1 - M);

                     const float S = S0*ms0 + S1*ms1;

-                    if (tiisg == 0) {
-                        ss[j*TS + 0] = S;
-                        ss[j*TS + 1] = M;
-
-                        ss[j*TS + 2*C + j        ] = ms0;
-                        ss[j*TS + 2*C + j + sg*SH] = ms1;
-                    }
+                    ss[j*TS + 0] = S;
+                    ss[j*TS + 1] = M;
+
+                    ss[j*TS + 2*C + j - 1*SH] = ms0;
+                    ss[j*TS + 2*C + j       ] = ms1;
                 }

+                //simdgroup_barrier(mem_flags::mem_threadgroup);
+
                 // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
                 {
                     s8x8_t ms0;
                     s8x8_t ms1;

-                    simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
-                    simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
+                    simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false);
+                    simdgroup_load(ms1, ss + 2*C,        TS, 0, false);

 #pragma unroll(DV8)
                     for (short i = 0; i < DV8; ++i) {
-                        o8x8_t t;
+                        simdgroup_float8x8 t;

                         simdgroup_load    (t, so + i*8, DV, 0, false);
-                        simdgroup_multiply(t, ms1, t);
+                        simdgroup_multiply(t, ms0, t);

-                        simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
+                        simdgroup_multiply_accumulate(t, ms1, lo[i], t);
+                        simdgroup_store(t, so + i*8, DV, 0, false);
                     }
                 }
             }
-        }
-
-        // store result to shared memory (reuse sq)
-        if (sgitg == 0) {
-            for (short i = 0; i < DV8; ++i) {
-                simdgroup_store(lo[i], so + i*8, DV, 0, false);
-            }
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+        }

-        threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*Q*DK);
+        threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);

         // final rescale with 1/S and store to global memory
         for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {

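The reworked reduction above is still the standard streaming-softmax merge: each pair of partial results (S0, M0, O0) and (S1, M1, O1) is combined by rescaling both sides to a shared row maximum, which the kernel applies through `diag(ms)` matrix multiplies. A numpy sketch of the merge rule, independent of the Metal specifics:

```python
import numpy as np

# Streaming-softmax merge of two partial attention results, as in the
# rewritten reduction loop: rescale both partial sums to a common max.
def merge(S0, M0, O0, S1, M1, O1):
    M = max(M0, M1)
    ms0 = np.exp(M0 - M)
    ms1 = np.exp(M1 - M)
    return S0*ms0 + S1*ms1, M, O0*ms0 + O1*ms1

def partial(xs, vs):
    # unnormalized softmax-weighted sum over one chunk of scores/values
    m = xs.max()
    w = np.exp(xs - m)
    return w.sum(), m, (w[:, None] * vs).sum(axis=0)

# toy check against a direct softmax-weighted sum over all scores
x = np.array([1.0, 3.0, 2.0, 0.5])
v = np.array([[1.0], [2.0], [3.0], [4.0]])
S, M, O = merge(*partial(x[:2], v[:2]), *partial(x[2:], v[2:]))
ref = (np.exp(x - x.max())[:, None] * v).sum(axis=0) / np.exp(x - x.max()).sum()
assert np.allclose(O / S, ref)
```

The `simdgroup_float8x8 t(1.0f); simdgroup_multiply(t, lo[i], t)` idiom in the hunk appears to exploit Metal's scalar simdgroup-matrix constructor (a diagonal matrix) to convert the accumulator to F32 on the way to shared memory; that reading is an inference from the code, not a statement from the commit.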
@@ -3723,8 +3718,8 @@ kernel void kernel_flash_attn_ext(
     half,  half4x4,  simdgroup_half8x8, \
     float,           simdgroup_float8x8, \
     float,           simdgroup_float8x8, \
-    float, float4,   simdgroup_float8x8
-    //half, half4,   simdgroup_half8x8
+    half,  half4,    simdgroup_half8x8
+    //float, float4, simdgroup_float8x8

 #define FA_TYPES_BF \
     bfloat, bfloat4, simdgroup_bfloat8x8, \

@@ -3732,8 +3727,8 @@ kernel void kernel_flash_attn_ext(
     bfloat, bfloat4x4, simdgroup_bfloat8x8, \
     float,             simdgroup_float8x8, \
     float,             simdgroup_float8x8, \
-    float, float4,     simdgroup_float8x8
-    //half, half4,     simdgroup_half8x8
+    half,  half4,      simdgroup_half8x8
+    //float, float4,   simdgroup_float8x8

 typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;

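Switching the FA_TYPES output row back to half is viable here because the rewritten reduction keeps its running sums in F32 shared memory. A small sketch of why the accumulator precision matters (toy values; fp16 addition stalls once the running sum dwarfs each addend):

```python
import numpy as np

# fp16 accumulation drops each 0.1 once the sum reaches 256 (ulp = 0.25),
# while fp32 stays close to the true value of 1000.0.
x = np.full(10000, 0.1, dtype=np.float16)

acc16 = np.float16(0.0)
for v in x:
    acc16 = np.float16(acc16 + v)  # rounds to fp16 after every add

acc32 = x.astype(np.float32).sum()
print(float(acc16), float(acc32))  # e.g. ~256.0 vs ~1000.0
```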
ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl (new file, 283 lines)
@@ -0,0 +1,283 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK4_0 32
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+// This function requires the original shuffled weights.
+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
+// packed together in a byte, so are (q[1], q[17]) and so on.
+inline float block_q_4_0_dot_y_flat(
+        global uchar * x,
+        global half  * dh,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float           d   = *dh;
+    global ushort * qs  = ((global ushort *)x + il/2);
+    float           acc = 0.f;
+
+    acc += yl.s0 * (qs[0] & 0x000F);
+    acc += yl.s1 * (qs[0] & 0x0F00);
+    acc += yl.s8 * (qs[0] & 0x00F0);
+    acc += yl.s9 * (qs[0] & 0xF000);
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc);
+}
+
+//
+// This variant outputs 8 values.
+//
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_8x_flat(
+        global char  * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = 0;
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half  * d = (global half  *) src0_d + offset0_d;
+    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float8 sumf = 0.f;
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float8 tot = (float8)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
+        global char  * src0_q,
+        global half  * src0_d,
+        global float * src1,
+        ulong offset1,
+        global char  * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb02,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb11,
+        ulong nb12,
+        int ne20,
+        int ne21,
+        ulong nb21,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float *)((global char *)src1 + offset1);
+    src2 = (global char  *)((global char *)src2 + offset2);
+    dst  = (global float *)((global char *)dst  + offsetd);
+
+    const int iid1 = get_group_id(2)/ne20;
+    const int idx  = get_group_id(2)%ne20;
+
+    const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
+
+    const int i11 = idx%ne11;
+    const int i12 = iid1;
+
+    const int i1 = idx;
+    const int i2 = i12;
+
+    global char  * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
+    global half  * src0_d_cur = src0_d + (i02*nb02/nb00);
+    global float * src1_cur   = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
+    global float * dst_cur    = dst + i1*ne0 + i2*ne1*ne0;
+
+    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}

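The masks and scale factors in `block_q_4_0_dot_y_flat` exploit q4_0's shuffled layout: byte j packs quant q[j] in its low nibble and q[j+16] in its high nibble, and two consecutive bytes are read as one 16-bit word, so four quants sit at bit offsets 0, 8, 4, and 12. Rather than shifting each nibble down, the kernel masks in place and folds the shift into y values pre-divided by 256, 16, or 4096, then subtracts the implicit 4-bit zero-point via `sumy * -8`. A Python sketch of the same arithmetic with made-up toy data:

```python
QK = 32
q = [(i * 7) % 16 for i in range(QK)]   # toy 4-bit quants
y = [0.01 * i for i in range(QK)]       # toy activations
d = 0.5                                 # toy block scale

# reference dot product: q4_0 stores q - 8
ref = d * sum((q[i] - 8) * y[i] for i in range(QK))

# kernel-style: byte j packs (q[j], q[j+16]); two bytes form one 16-bit word,
# nibbles are masked in place and the shifts folded into pre-scaled y values
words = [q[2*k] | (q[2*k + 16] << 4) | (q[2*k + 1] << 8) | (q[2*k + 17] << 12) for k in range(8)]
acc = 0.0
for k in range(8):
    w = words[k]
    acc += y[2*k]             * (w & 0x000F)
    acc += y[2*k + 1]  / 256  * (w & 0x0F00)
    acc += y[2*k + 16] / 16   * (w & 0x00F0)
    acc += y[2*k + 17] / 4096 * (w & 0xF000)

out = d * (sum(y) * -8.0 + acc)
assert abs(out - ref) < 1e-9
```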
(diff for one file suppressed because it is too large)

@@ -333,7 +333,9 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",           # nomic-bert
             "encoder.layers.{bid}.mlp.fc1",            # nomic-bert-moe
             "model.layers.{bid}.mlp.c_fc",             # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v",  # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_v",  # jina-bert-v2 (split up/gate, no longer used)
+            "encoder.layer.{bid}.mlp.gated_layers",    # jina-bert-v2 (GEGLU)
+            "encoder.layer.{bid}.mlp.up_gated_layer",  # jina-v2-code (GEGLU)
             "model.layers.{bid}.residual_mlp.w3",      # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",  # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",          # exaone

@@ -370,7 +372,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
             "model.layers.{bid}.feed_forward.w1",       # internlm2
             "encoder.layers.{bid}.mlp.fc12",            # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w",   # jina-bert-v2
+            "encoder.layer.{bid}.mlp.gated_layers_w",   # jina-bert-v2 (split up/gate, no longer used)
             "transformer.h.{bid}.mlp.linear_1",         # refact
             "model.layers.{bid}.residual_mlp.w1",       # arctic
             "transformer.h.{bid}.mlp.c_fc_0",           # exaone

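These mapping hunks register the fused `gated_layers`/`up_gated_layer` names directly, replacing the conversion-time split removed above. A toy sketch of how `{bid}`-templated entries resolve (the `blk.{bid}.ffn_up` target and 32-block limit are illustrative; the real TensorNameMap covers many architectures and tensor kinds):

```python
# Toy version of a {bid}-templated tensor name table: format the block index
# into each template and look the candidate up.
templates = {
    "encoder.layer.{bid}.mlp.gated_layers":   "blk.{bid}.ffn_up",
    "encoder.layer.{bid}.mlp.up_gated_layer": "blk.{bid}.ffn_up",
}

def map_name(hf_name: str, n_blocks: int = 32):
    for src, dst in templates.items():
        for bid in range(n_blocks):
            if hf_name == src.format(bid=bid):
                return dst.format(bid=bid)
    return None

assert map_name("encoder.layer.3.mlp.gated_layers") == "blk.3.ffn_up"
```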
@@ -250,22 +250,6 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
         }
     }
 }

-void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
-    GGML_UNUSED(ubatch);
-
-    const int64_t n_kv = kv_state->get_n_kv();
-
-    if (s_mask) {
-        GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
-        float * data = (float *) s_mask->data;
-
-        // clear unused states
-        for (int i = 0; i < n_kv; ++i) {
-            data[i] = kv_state->s_mask(i);
-        }
-    }
-}
-
 void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     GGML_UNUSED(ubatch);

@@ -650,6 +634,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
                 int64_t split_point = cur->ne[0] / 2;
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));

@@ -663,7 +648,7 @@ ggml_tensor * llm_graph_context::build_ffn(
             {
                 // Split into two equal parts
                 int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
                 ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                 ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));

@@ -986,23 +971,6 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
     return cur;
 }

-ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
-
-    auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);
-
-    const auto n_kv = kv_state->get_n_kv();
-
-    auto & cur = inp->s_mask;
-
-    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
-    ggml_set_input(cur);
-
-    res->add_input(std::move(inp));
-
-    return cur;
-}
-
 ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);

@@ -1455,43 +1423,53 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }

-ggml_tensor * llm_graph_context::build_copy_mask_state(
+ggml_tensor * llm_graph_context::build_recurrent_state(
         ggml_cgraph * gf,
         ggml_tensor * s,
         ggml_tensor * state_copy,
-        ggml_tensor * state_mask,
-            int32_t   n_state,
-            int32_t   n_seqs) const {
+            int32_t   state_size,
+            int32_t   n_seqs,
+               bool   avoid_copies) const {
     const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

     const auto n_kv    = kv_state->get_n_kv();
     const auto kv_head = kv_state->get_head();
+    const auto rs_zero = kv_state->get_rs_z();

-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size());
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_state->get_size());

+    // Clear a single state which will then be copied to the other cleared states.
+    // Note that this is a no-op when the view is zero-sized.
+    ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
+    ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
+
+    ggml_tensor * output_states;
+
+    if (!avoid_copies) {
         // copy states
         // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
-        // this shrinks the tensors's ne[1] to n_kv
-        states = ggml_get_rows(ctx0, states, state_copy);
+        // {state_size, kv_size} -> {state_size, n_seqs}
+        output_states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
+        ggml_build_forward_expand(gf, output_states);
+    } else {
+        // FIXME: make the gathering operation happen before the copy below
+        //        (maybe with an optional lambda function passed as a parameter instead of `avoid_copies`?)
+        output_states = states;
+    }

-    // clear states of sequences which are starting at the beginning of this batch
-    // FIXME: zero-out NANs?
-    states = ggml_mul(ctx0, states, state_mask);
-
-    // copy states which won't be changed further (between n_seqs and n_kv)
+    // copy extra states which won't be changed further (between n_seqs and n_kv)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
     ggml_build_forward_expand(gf,
         ggml_cpy(ctx0,
-            ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs          )*n_state*ggml_element_size(states)),
-            ggml_view_1d(ctx0, s,      n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
+            states_extra,
+            ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));

-    // the part of the states that will be used and modified
-    return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
+    return output_states;
 }

 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         ggml_cgraph * gf,
         ggml_tensor * state_copy,
-        ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
         int il) const {
     const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);

@@ -1502,8 +1480,8 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(

     ggml_tensor * token_shift_all = kv_state->get_k_l(il);

-    ggml_tensor * token_shift = build_copy_mask_state(
-            gf, token_shift_all, state_copy, state_mask,
+    ggml_tensor * token_shift = build_recurrent_state(
+            gf, token_shift_all, state_copy,
             hparams.n_embd_k_s(), n_seqs);

     token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);

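Conceptually, `build_recurrent_state` now does two things that make the old `state_mask` input unnecessary: it zeroes one designated slot `rs_z` in-graph, and it gathers per-sequence states by row index from `state_copy`. A numpy sketch of the same data movement (`states[...]` indexing standing in for `ggml_get_rows`; note ggml writes shapes as {state_size, kv_size} while numpy shows the transposed (kv_size, state_size)):

```python
import numpy as np

# numpy stand-in for the new data flow: zero one designated slot, then gather
# per-sequence states by row index.
state_size, kv_size, n_seqs = 4, 6, 2
states = np.arange(kv_size * state_size, dtype=np.float32).reshape(kv_size, state_size)

rs_z = 3                 # first cell with no src references
states[rs_z] = 0.0       # replaces the old ggml_mul(states, state_mask)

state_copy = np.array([0, 3, 1, 2, 4, 5])     # copy source for each cell
output_states = states[state_copy[:n_seqs]]   # gather the first n_seqs rows

assert output_states.shape == (n_seqs, state_size)
assert (output_states[1] == 0).all()          # this sequence starts from the zeroed state
```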
@@ -200,18 +200,6 @@ public:
     const llama_kv_cache_recurrent_state * kv_state;
 };

-class llm_graph_input_s_mask : public llm_graph_input_i {
-public:
-    llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
-    virtual ~llm_graph_input_s_mask() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * s_mask; // F32 [1, n_kv]
-
-    const llama_kv_cache_recurrent_state * kv_state;
-};
-
 class llm_graph_input_cross_embd : public llm_graph_input_i {
 public:
     llm_graph_input_cross_embd(

@@ -521,7 +509,6 @@ struct llm_graph_context {
     ggml_tensor * build_inp_mean() const;
     ggml_tensor * build_inp_cls() const;
     ggml_tensor * build_inp_s_copy() const;
-    ggml_tensor * build_inp_s_mask() const;

     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;

@@ -606,18 +593,17 @@ struct llm_graph_context {
     // recurrent
     //

-    ggml_tensor * build_copy_mask_state(
+    ggml_tensor * build_recurrent_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
-            int32_t   n_state,
-            int32_t   n_seqs) const;
+            int32_t   state_size,
+            int32_t   n_seqs,
+            bool      avoid_copies = false) const;

     ggml_tensor * build_rwkv_token_shift_load(
             ggml_cgraph * gf,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const;

@@ -406,21 +406,12 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche

     bool success = true;

-    // TODO: here we have to verify that all ubatches can fit in the cells
-    // however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells
-    // during the compute of each ubatch. to reproduce, uncomment the following loop and run:
-    //
-    //   $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8
-    //
-    // recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed
-    //
-    GGML_UNUSED(ubatches);
-    //for (const auto & ubatch : ubatches) {
-    //    if (!find_slot(ubatch)) {
-    //        success = false;
-    //        break;
-    //    }
-    //}
+    for (const auto & ubatch : ubatches) {
+        if (!find_slot(ubatch)) {
+            success = false;
+            break;
+        }
+    }

     // restore the original state
     cells = std::move(org_cells);

@@ -431,14 +422,13 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
 }

 bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
-    const uint32_t n_tokens = ubatch.n_tokens;
     const uint32_t n_seqs = ubatch.n_seqs;

     const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

     // if we have enough unused cells before the current head ->
     //   better to start searching from the beginning of the cache, hoping to fill it
-    if (head > used + 2*n_tokens) {
+    if (head > used + 2*n_seqs) {
         head = 0;
     }

@@ -534,16 +524,16 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
                     empty_cell.src = orig_cell.src;
                     orig_cell.seq_id.erase(seq_id);
                     empty_cell.seq_id.insert(seq_id); // will be overwritten
+                    GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
                 }
                 seq_meta.tail = next_empty_cell;
                 // find next empty cell
                 if (s + 1 < n_seqs) {
-                    next_empty_cell += 1;
                     for (uint32_t i = 0; i < size; ++i) {
+                        next_empty_cell += 1;
                         if (next_empty_cell >= size) { next_empty_cell -= size; }
                         kv_cell & cell = cells[next_empty_cell];
                         if (cell.is_empty()) { break; }
-                        next_empty_cell += 1;
                     }
                 }
             }

@@ -553,8 +543,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {

     // gather and re-order
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        int32_t dst_id = s + min;
-        int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
+        const int32_t dst_id = s + min;
+        const int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
         if (dst_id != src_id) {
             kv_cell & dst_cell = cells[dst_id];
             kv_cell & src_cell = cells[src_id];

@@ -563,12 +553,14 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
             std::swap(dst_cell.src,    src_cell.src);
             std::swap(dst_cell.seq_id, src_cell.seq_id);

-            // swap tails (assuming they NEVER overlap)
-            for (const llama_seq_id seq_id : src_cell.seq_id) {
-                cells[seq_id].tail = src_id;
-            }
-            for (const llama_seq_id seq_id : dst_cell.seq_id) {
-                cells[seq_id].tail = dst_id;
+            // swap tails
+            for (uint32_t i = 0; i < size; ++i) {
+                int32_t & tail = cells[i].tail;
+                if (tail == src_id) {
+                    tail = dst_id;
+                } else if (tail == dst_id) {
+                    tail = src_id;
+                }
             }
         }
     }

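The old code rewired tails only through the two cells' own seq_id sets, under the assumption that tail references never overlap; the new loop scans every cell and swaps any tail that points at either slot, which stays correct even when other cells reference them. A small sketch of the fixed swap with toy data:

```python
# Sketch of the fixed tail swap: scan every cell and exchange references to
# the two swapped slots, instead of assuming who points where.
tails = [2, 5, 2, -1, 5, 0]    # toy tail pointers, -1 = none
src_id, dst_id = 2, 5

for i, t in enumerate(tails):
    if t == src_id:
        tails[i] = dst_id
    elif t == dst_id:
        tails[i] = src_id

assert tails == [5, 2, 5, -1, 2, 0]
```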
@@ -576,7 +568,7 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     // update the pos of the used seqs
     for (uint32_t s = 0; s < n_seqs; ++s) {
         const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
-        int32_t cell_id = s + min;
+        const int32_t cell_id = s + min;
         kv_cell & cell = cells[cell_id];

         if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {

@@ -594,6 +586,38 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
         }
     }

+    // Find first cell without src refs, to use as the zero-ed state
+    {
+        // TODO: bake-in src refcounts in the cell metadata
+        std::vector<int32_t> refcounts(size, 0);
+        for (size_t i = 0; i < size; ++i) {
+            const int32_t src = cells[i].src;
+            if (src >= 0) {
+                refcounts[src] += 1;
+            }
+        }
+
+        rs_z = -1;
+        for (int i = min; i <= max; ++i) {
+            if (refcounts[i] == 0) {
+                rs_z = i;
+                break;
+            }
+        }
+
+        for (int i = min; i <= max; ++i) {
+            if (cells[i].src < 0) {
+                GGML_ASSERT(rs_z >= 0);
+                cells[i].src0 = rs_z;
+            } else {
+                // Stage the source ids for all used cells to allow correct seq_* behavior
+                // and still make these values available when setting the inputs
+                cells[i].src0 = cells[i].src;
+            }
+            cells[i].src = i; // avoid moving or clearing twice
+        }
+    }
+
     // allow getting the range of used cells, from head to head + n
     head = min;
     n    = max - min + 1;

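The zero-state selection builds a refcount over the `src` pointers and picks the first cell in [min, max] that nothing copies from; cells with no source are then pointed at that slot via `src0`, so the single in-graph clear in `build_recurrent_state` serves every fresh sequence. A compact sketch of the same bookkeeping:

```python
# Sketch of the rs_z selection: count how often each cell is used as a copy
# source, then pick the first unreferenced cell as the shared zeroed state.
src = [0, 0, -1, 3]            # toy per-cell copy sources, -1 = fresh sequence
size = len(src)
lo, hi = 0, size - 1           # stand-ins for the min/max used-cell range

refcounts = [0] * size
for s in src:
    if s >= 0:
        refcounts[s] += 1

rs_z = next((i for i in range(lo, hi + 1) if refcounts[i] == 0), -1)
src0 = [s if s >= 0 else rs_z for s in src]

assert rs_z == 1               # cell 1 is never a source; safe to zero
assert src0 == [0, 0, 1, 3]    # fresh cell 2 copies from the zeroed slot
```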
@@ -605,47 +629,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
 }

 bool llama_kv_cache_recurrent::get_can_shift() const {
-    return false;
-}
-
-int32_t llama_kv_cache_recurrent::s_copy(int i) const {
-    const uint32_t cell_id = i + head;
-
-    //////////////////////////////////////////////
-    // TODO: this should not mutate the KV cache !
-    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
-
-    // prevent out-of-bound sources
-    if (cell.src < 0 || (uint32_t) cell.src >= size) {
-        cell.src = cell_id;
-    }
-
-    int32_t res = cell.src;
-
-    // TODO: do not mutate the KV cache
-    // ensure copy only happens once
-    if (cell.src != (int32_t) cell_id) {
-        cell.src = cell_id;
-    }
-
-    return res;
-}
-
-float llama_kv_cache_recurrent::s_mask(int i) const {
-    const uint32_t cell_id = i + head;
-
-    //////////////////////////////////////////////
-    // TODO: this should not mutate the KV cache !
-    kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
-
-    float res = (float) (cell.src >= 0);
-
-    // only clear once
-    if (cell.src < 0) {
-        cell.src = cell_id;
-    }
-
-    return res;
+    // shifting the pos is trivial for recurrent models
+    return true;
 }

 size_t llama_kv_cache_recurrent::total_size() const {

@@ -1111,6 +1096,10 @@ uint32_t llama_kv_cache_recurrent_state::get_head() const {
     return is_full ? 0 : kv->head;
 }

+int32_t llama_kv_cache_recurrent_state::get_rs_z() const {
+    return is_full ? 0 : kv->rs_z;
+}
+
 uint32_t llama_kv_cache_recurrent_state::get_size() const {
     return kv->size;
 }

@@ -1124,9 +1113,5 @@ ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const {
 }

 int32_t llama_kv_cache_recurrent_state::s_copy(int i) const {
-    return kv->s_copy(i);
-}
-
-float llama_kv_cache_recurrent_state::s_mask(int i) const {
-    return kv->s_mask(i);
+    return kv->cells[i + kv->head].src0;
 }

@@ -57,10 +57,6 @@ public:

     bool get_can_shift() const override;

-    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
-    int32_t s_copy(int i) const;
-    float   s_mask(int i) const;
-
     // state write/load

     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;

@@ -73,10 +69,14 @@ public:
     // computed before each graph build
     uint32_t n = 0;

+    // first zero-ed state
+    int32_t rs_z = -1;
+
     // TODO: optimize for recurrent state needs
     struct kv_cell {
         llama_pos pos  = -1;
-        int32_t   src  = -1; // used to copy states
+        int32_t   src  = -1; // used to know where states should be copied from
+        int32_t   src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
         int32_t   tail = -1;

         std::set<llama_seq_id> seq_id;

@@ -157,13 +157,13 @@ public:

     uint32_t get_n_kv() const;
     uint32_t get_head() const;
+    int32_t  get_rs_z() const;
     uint32_t get_size() const;

     ggml_tensor * get_k_l(int32_t il) const;
     ggml_tensor * get_v_l(int32_t il) const;

     int32_t s_copy(int i) const;
-    float s_mask(int i) const;

 private:
     const llama_memory_status status;

@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
     }
+
+    const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
+    debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 }

 void llama_kv_cache_unified::clear(bool data) {

@@ -462,7 +465,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
     for (uint32_t i = 0; i < n_kv; ++i) {
         assert(dinfo.ids[i] <= n_kv);

-        if (dinfo.ids[i] == n_kv) {
+        if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
             continue;
         }

@@ -512,21 +515,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
         head_cur = 0;
     }

-    // otherwise, one cell per token.
-
     if (n_tokens > cells.size()) {
         LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
         return -1;
     }

-//#define FIND_SLOT_DEBUG 1
-#if FIND_SLOT_DEBUG
-    LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
+    if (debug > 0) {
+        LLAMA_LOG_CONT("\n");
+        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);

-    // for debugging
-    {
-        std::string ss;
-        if (n_swa > 0) {
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
+            std::string ss;
             for (uint32_t i = 0; i < cells.size(); ++i) {
                 if (cells.is_empty(i)) {
                     ss += '.';
@@ -534,11 +533,35 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                     ss += std::to_string(cells.seq_get(i));
                 }
                 if (i%256 == 255) {
+                    ss += " *";
                     ss += '\n';
                 }
             }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
         }
-        LLAMA_LOG_WARN("\n%s\n", ss.c_str());
+
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
+            std::string ss;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                std::string cur;
+                if (cells.is_empty(i)) {
+                    cur = '.';
+                } else {
+                    cur = std::to_string(cells.pos_get(i));
+                }
+                const int n = cur.size();
+                for (int j = 0; j < 5 - n; ++j) {
+                    cur += ' ';
+                }
+                ss += cur;
+                if (i%256 == 255) {
+                    ss += " *";
+                }
+                if (i%64 == 63) {
+                    ss += '\n';
+                }
+            }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+        }

         for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {

@@ -546,9 +569,9 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                 continue;
             }

-        LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+        }
     }
-#endif

     uint32_t n_tested = 0;

@@ -559,21 +582,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             continue;
         }

-        // keep track of what the minimum sequence positions would be if we accept the ubatch
-        llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
-            seq_pos_min[s] = cells.seq_pos_min(s);
-        }
-
         bool found = true;
         for (uint32_t i = 0; i < n_tokens; i++) {
-            const llama_pos    pos    = ubatch.pos[i];
-            const llama_seq_id seq_id = ubatch.seq_id[i][0];
+            //const llama_pos    pos    = ubatch.pos[i];
+            //const llama_seq_id seq_id = ubatch.seq_id[i][0];

             // can we use this cell? either:
             // - the cell is empty
             // - the cell is occupied only by one sequence:
-            //   - mask causally, if the sequence is the same as the one we are inserting
+            //   - (disabled) mask causally, if the sequence is the same as the one we are inserting
             //   - mask SWA, using current max pos for that sequence in the cache
             //   always insert in the cell with minimum pos
             bool can_use = cells.is_empty(head_cur + i);

@ -581,21 +598,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||||
if (!can_use && cells.seq_count(head_cur + i) == 1) {
|
if (!can_use && cells.seq_count(head_cur + i) == 1) {
|
||||||
const llama_pos pos_cell = cells.pos_get(head_cur + i);
|
const llama_pos pos_cell = cells.pos_get(head_cur + i);
|
||||||
|
|
||||||
// causal mask
|
// (disabled) causal mask
|
||||||
if (cells.seq_has(head_cur + i, seq_id)) {
|
// note: it's better to purge any "future" tokens beforehand
|
||||||
can_use = pos_cell >= pos;
|
//if (cells.seq_has(head_cur + i, seq_id)) {
|
||||||
}
|
// can_use = pos_cell >= pos;
|
||||||
|
//}
|
||||||
|
|
||||||
if (!can_use) {
|
if (!can_use) {
|
||||||
const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
|
const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
|
||||||
|
|
||||||
// SWA mask
|
// SWA mask
|
||||||
// note: we insert only in the cell with minimum pos in order to preserve the invariant that
|
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||||
// all positions between [pos_min, pos_max] for each sequence will be present in the cache
|
|
||||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
|
|
||||||
if (pos_cell == seq_pos_min[seq_id_cell] &&
|
|
||||||
is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
|
||||||
seq_pos_min[seq_id_cell]++;
|
|
||||||
can_use = true;
|
can_use = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
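Note: the rewritten branch leans entirely on is_masked_swa(p0, p1): a cell may be reused as soon as its position falls outside the sliding window of its own sequence. A sketch of the standard-window case under the n_swa field visible elsewhere in this diff (the exact rule and any chunked variant are assumptions):

    // sketch: with a standard sliding window, a cell at position p0 is masked
    // (reusable) once the newest position p1 is at least n_swa ahead of it
    static bool is_masked_swa_standard(llama_pos p0, llama_pos p1, uint32_t n_swa) {
        return p1 - p0 >= (llama_pos) n_swa; // assumed window rule
    }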
@@ -623,8 +636,22 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 }
 
 void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
+    // keep track of the max sequence position that we would overwrite with this ubatch
+    // for non-SWA cache, this would be always empty
+    llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
+    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        seq_pos_max_rm[s] = -1;
+    }
+
     for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
         if (!cells.is_empty(head_cur + i)) {
+            assert(cells.seq_count(head_cur + i) == 1);
+
+            const llama_seq_id seq_id = cells.seq_get(head_cur + i);
+            const llama_pos    pos    = cells.pos_get(head_cur + i);
+
+            seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
+
             cells.rm(head_cur + i);
         }
 
@@ -635,6 +662,22 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
         }
     }
 
+    // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
+    //       will be present in the cache. so we have to purge any position which is less than those we would overwrite
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
+    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        if (seq_pos_max_rm[s] == -1) {
+            continue;
+        }
+
+        if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
+            LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
+                    __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
+
+            seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
+        }
+    }
+
     // move the head at the end of the slot
     head = head_cur + ubatch.n_tokens;
 }
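Worked example of the invariant: suppose sequence 0 holds positions [10, 20] in the cache and the incoming ubatch overwrites cells holding positions 12..14, so seq_pos_max_rm[0] == 14. The purge then calls seq_rm(0, 10, 15), dropping positions 10..14 entirely; otherwise positions 10 and 11 would survive while 12..14 disappear, leaving a hole between pos_min and pos_max for that sequence.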
@@ -944,11 +987,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
     const auto & n_embd_head_k = hparams.n_embd_head_k;
   //const auto & n_embd_head_v = hparams.n_embd_head_v;
 
-    //GGML_ASSERT(kv_self->size == n_ctx);
-
     auto inp = std::make_unique<llm_graph_input_k_shift>(this);
 
-    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
+    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
     ggml_set_input(inp->k_shift);
 
     for (const auto & layer : layers) {
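Note: sizing k_shift by cells.size() instead of cparams.n_ctx ties the shift graph to the actual number of KV cells, which matters once the unified cache can be allocated with a size different from the context length.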
@@ -158,6 +158,8 @@ private:
     // SWA
     const uint32_t n_swa = 0;
 
+    int debug = 0;
+
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
@@ -80,6 +80,9 @@ public:
         assert(isrc < pos.size());
         assert(idst < pos.size());
 
+        assert(pos[idst] == -1);
+        assert(pos[isrc] != -1);
+
         pos  [idst] = pos  [isrc];
         shift[idst] = shift[isrc];
         seq  [idst] = seq  [isrc];
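Note: the two new asserts pin down the contract of this move helper: the destination cell must be empty and the source occupied. A hedged sketch of the whole operation, assuming the pos/shift/seq arrays shown in the context lines and that the source is vacated afterwards (this diff does not show the tail of the function):

    // sketch: move cell isrc into the empty cell idst
    assert(pos[idst] == -1); // destination must be vacant
    assert(pos[isrc] != -1); // source must be occupied

    pos  [idst] = pos  [isrc];
    shift[idst] = shift[isrc];
    seq  [idst] = seq  [isrc];

    pos  [isrc] = -1;        // assumed: clear the source cell
    shift[isrc] = 0;
    seq  [isrc].reset();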
@@ -144,9 +147,10 @@ public:
         assert(pos[i] != -1);
 
         seq_pos_rm(i);
+        seq[i].reset();
 
         pos[i] = -1;
-        seq[i].reset();
+        shift[i] = 0;
 
         used.erase(i);
     }
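Note: the ordering here is load-bearing: seq_pos_rm(i) reads seq[i] to know which per-sequence position sets to update, so the bitset is reset only after that call, and the new shift[i] = 0 keeps the shift bookkeeping consistent whenever a cell is vacated (the same reset is added to the other eviction paths below).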
@@ -164,6 +168,7 @@ public:
 
         if (seq[i].none()) {
             pos[i] = -1;
+            shift[i] = 0;
 
             used.erase(i);
 
@@ -192,6 +197,7 @@ public:
         seq[i].reset();
 
         pos[i] = -1;
+        shift[i] = 0;
 
         used.erase(i);
 
@@ -317,21 +323,20 @@ public:
         pos[i] += d;
         shift[i] += d;
 
-        seq_pos_add(i);
-
         has_shift = true;
 
         if (pos[i] < 0) {
-            seq_pos_rm(i);
-
             seq[i].reset();
             pos[i] = -1;
+            shift[i] = 0;
 
             used.erase(i);
 
             return true;
         }
 
+        seq_pos_add(i);
+
         return false;
     }
 
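Note: moving seq_pos_add(i) below the pos[i] < 0 check means a shifted cell is re-registered in the per-sequence position sets only if it survives the shift; a cell pushed to a negative position is vacated instead and never re-added.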
@@ -2320,8 +2320,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                     layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
 
-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
 
                     layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
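Note: ffn_gate is now optional; when the checkpoint fuses the up and gate projections into a single matrix, ffn_up is created with n_ff * 2 columns and carries both halves. This is what the LLM_FFN_GEGLU selection in the llm_build_bert hunk below relies on. A sketch of how such a fused tensor can be consumed at graph-build time; the code below is illustrative, not part of this diff, and which half is the gate is a convention of the checkpoint:

    // sketch: apply a fused [n_embd, 2*n_ff] up+gate projection (GEGLU-style)
    ggml_tensor * up_gate = ggml_mul_mat(ctx0, layer.ffn_up, cur); // -> [2*n_ff, n_tokens]
    ggml_tensor * up   = ggml_view_2d(ctx0, up_gate, n_ff, n_tokens, up_gate->nb[1], 0);
    ggml_tensor * gate = ggml_view_2d(ctx0, up_gate, n_ff, n_tokens, up_gate->nb[1],
                                      n_ff * ggml_element_size(up_gate));
    cur = ggml_mul(ctx0, ggml_gelu(ctx0, gate), up); // assumed half ordering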
@@ -6143,7 +6143,7 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_gate, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
-                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                        model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
             } else {
                 cur = build_ffn(cur,
@@ -8957,7 +8957,6 @@ struct llm_build_mamba : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8966,8 +8965,7 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
-            //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
-            cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
+            cur = build_mamba_layer(gf, cur, state_copy, ubatch, il);
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -9008,7 +9006,6 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -9035,12 +9032,12 @@ struct llm_build_mamba : public llm_graph_context {
         ggml_tensor * ssm_states_all  = kv_state->get_v_l(il);
 
         // (ab)using the KV cache to store the states
-        ggml_tensor * conv = build_copy_mask_state(
-                gf, conv_states_all, state_copy, state_mask,
+        ggml_tensor * conv = build_recurrent_state(
+                gf, conv_states_all, state_copy,
                 hparams.n_embd_k_s(), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-        ggml_tensor * ssm = build_copy_mask_state(
-                gf, ssm_states_all, state_copy, state_mask,
+        ggml_tensor * ssm = build_recurrent_state(
+                gf, ssm_states_all, state_copy,
                 hparams.n_embd_v_s(), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
 
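Note: build_copy_mask_state is collapsed into build_recurrent_state, dropping the state_mask input everywhere: judging from the call sites, the copy indices in state_copy alone now determine which per-sequence states are carried over, so cleared states no longer need a separate mask tensor. The same one-argument-shorter call pattern repeats in every Mamba and RWKV hunk below.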
@@ -11756,7 +11753,6 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
@@ -11880,8 +11876,8 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
         }
 
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                 hparams.n_embd_v_s(), n_seqs);
 
         ggml_tensor * wkv_output;
@@ -11937,7 +11933,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11948,7 +11943,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
                 );
 
         ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -11964,7 +11959,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
                 1
                 );
 
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -12035,7 +12030,6 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         inpL = build_inp_embd(model.tok_embd);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12046,7 +12040,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
                 );
 
         ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -12059,7 +12053,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
                 1
                 );
 
-            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
+            cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
 
             token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12151,7 +12145,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
@@ -12234,8 +12227,8 @@ struct llm_build_rwkv7_base : public llm_graph_context {
         v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
         a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
 
-        ggml_tensor * wkv_state = build_copy_mask_state(
-                gf, kv_state->get_v_l(il), state_copy, state_mask,
+        ggml_tensor * wkv_state = build_recurrent_state(
+                gf, kv_state->get_v_l(il), state_copy,
                 hparams.n_embd_v_s(), n_seqs);
 
         ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -12293,7 +12286,6 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12304,7 +12296,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
                 );
 
         ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -12320,7 +12312,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
                 1
                 );
 
-            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -12387,7 +12379,6 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
         inpL = build_inp_embd(model.tok_embd);
 
         ggml_tensor * state_copy = build_inp_s_copy();
-        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12398,7 +12389,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
         inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
 
         ggml_tensor * token_shift = build_rwkv_token_shift_load(
-                gf, state_copy, state_mask, ubatch, il
+                gf, state_copy, ubatch, il
                 );
 
         ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -12411,7 +12402,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
                 1
                 );
 
-            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
 
             token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
tests/test-tokenizers-repo.sh (new executable file, 36 lines)

@@ -0,0 +1,36 @@
+#!/bin/bash
+
+if [ $# -lt 2 ]; then
+    printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
+    exit 1
+fi
+
+if [ $# -eq 3 ]; then
+    toktest=$3
+else
+    toktest="./test-tokenizer-0"
+fi
+
+if [ ! -x $toktest ]; then
+    printf "Test executable \"$toktest\" not found!\n"
+    exit 1
+fi
+
+repo=$1
+folder=$2
+
+if [ -d $folder ] && [ -d $folder/.git ]; then
+    (cd $folder; git pull)
+else
+    git clone $repo $folder
+fi
+
+shopt -s globstar
+for gguf in $folder/**/*.gguf; do
+    if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
+        $toktest $gguf
+    else
+        printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
+    fi
+done
+
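Usage sketch (arguments are placeholders, not values taken from this diff): run ./tests/test-tokenizers-repo.sh <git-repo-with-vocab-ggufs> <target-folder> [./test-tokenizer-0] from the directory containing the built test binaries; the script clones the repository (or pulls if it is already present) and runs the tokenizer test on every *.gguf that has matching .inp/.out files next to it.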
Binary file not shown.
@@ -233,6 +233,7 @@ struct server_task {
         slot_params defaults;
         defaults.sampling    = params_base.sampling;
         defaults.speculative = params_base.speculative;
+        defaults.n_keep      = params_base.n_keep;
 
         // enabling this will output extra debug information in the HTTP responses from the server
         params.verbose = params_base.verbosity > 9;
@@ -2060,6 +2061,7 @@ struct server_context {
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
             slot.params.sampling = params_base.sampling;
+            slot.params.n_keep   = params_base.n_keep;
 
             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -3556,9 +3558,6 @@ struct server_context {
                 const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
                 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
 
-                // keep track of total number of tokens generated in the draft
-                slot.n_draft_total += draft.size();
-
                 // ignore small drafts
                 if (slot.params.speculative.n_min > (int) draft.size()) {
                     SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3566,6 +3565,9 @@ struct server_context {
                     continue;
                 }
 
+                // keep track of total number of drafted tokens tested
+                slot.n_draft_total += draft.size();
+
                 // construct the speculation batch
                 common_batch_clear(slot.batch_spec);
                 common_batch_add  (slot.batch_spec, id, slot.n_past, { slot.id }, true);
@@ -3584,7 +3586,7 @@ struct server_context {
                 slot.n_past    += ids.size();
                 slot.n_decoded += ids.size();
 
-                // update how many tokens out of draft was accepted
+                // update how many tokens out of those tested were accepted
                 slot.n_draft_accepted += ids.size() - 1;
 
                 slot.cache_tokens.push_back(id);
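Note: with the counter moved below the small-draft early-out, n_draft_total counts only drafted tokens that were actually submitted for verification, which the reworded comment reflects. Worked example: a tested draft of 8 tokens from which 3 are accepted (so ids.size() == 4, the 3 accepted draft tokens plus the next sampled token) adds 8 to slot.n_draft_total and 3 to slot.n_draft_accepted, i.e. a 37.5% acceptance rate for that step.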
@@ -41,6 +41,10 @@ html {
   max-width: 900px;
 }
 
+.chat-bubble {
+  @apply break-words;
+}
+
 .chat-bubble-base-300 {
   --tw-bg-opacity: 1;
   --tw-text-opacity: 1;