Mirror of https://github.com/LostRuins/koboldcpp.git

Merge commit '947538acb8' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	CMakePresets.json
#	examples/llama-bench/llama-bench.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-quantize-fns.cpp

Commit 70cdb55cc9: 29 changed files with 93418 additions and 92397 deletions
@@ -1679,6 +1679,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--output-format") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
+        else if (value == "md") { params.batched_bench_output_jsonl = false; }
+        else { invalid_param = true; }
+        return true;
+    }
    if (arg == "--no-warmup") {
        params.warmup = false;
        return true;
@@ -2069,6 +2077,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
    options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });

+    options.push_back({ "batched-bench" });
+    options.push_back({ "batched-bench", " --output-format {md,jsonl}", "output format for batched-bench results (default: md)" });
+
    printf("usage: %s [options]\n", argv[0]);

    for (const auto & o : options) {
@@ -299,6 +299,9 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
};

void gpt_params_parse_from_env(gpt_params & params);
@@ -308,6 +308,20 @@ class Model:
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                if isinstance(data_qtype, bool):
                    if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -318,6 +332,10 @@ class Model:
                        data_qtype = gguf.GGMLQuantizationType.BF16
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                    else:
                        raise ValueError(f"Unknown file type: {self.ftype.name}")
@@ -1623,15 +1641,16 @@ class BitnetModel(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
        dtype = weight.dtype
        weight = weight.float()
-        s = 1 / weight.abs().mean().clamp(min=1e-5)
-        weight = (weight * s).round().clamp(-1, 1) / s
-        scale = weight.abs().max().unsqueeze(0)
-        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
-        weight = torch.sign(weight).type(dtype)
-        return weight.type(dtype), scale.type(torch.float32)
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)
@@ -1646,11 +1665,9 @@ class BitnetModel(Model):
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)


@Model.register("GrokForCausalLM")
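Editorial note, not part of the patch: the reworked `weight_quant` above ternarizes a tensor against a single per-tensor scale taken from the mean absolute value and now returns one tensor instead of a (weight, scale) pair. A minimal sketch of the same round trip, ported to NumPy to stay self-contained (the upstream code uses torch; names here are assumptions):

```python
import numpy as np

def weight_quant(weight: np.ndarray) -> np.ndarray:
    dtype = weight.dtype
    w = weight.astype(np.float32)
    scale = max(float(np.abs(w).mean()), 1e-5)  # per-tensor scale
    iscale = 1.0 / scale
    # ternarize, then bring the values back to the original magnitude
    result = np.clip(np.round(w * iscale), -1, 1) / iscale
    return result.astype(dtype)

w = np.random.randn(4, 8).astype(np.float32)
q = weight_quant(w)
assert len(np.unique(q)) <= 3  # only -scale, 0 and +scale remain
```

The returned tensor takes at most three levels, which is what the TQ1_0/TQ2_0 file types added elsewhere in this diff later pack.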
@@ -4011,8 +4028,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -4099,6 +4116,8 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }
@@ -49,3 +49,12 @@ There are 2 modes of operation:
 | 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
 | 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
 | 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
+
+```json lines
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
+```
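Editorial note, not part of the diff: each JSONL row is a self-contained JSON object, so benchmark results can be consumed without a Markdown parser. A small sketch of reading such rows (the sample below is a trimmed copy of the lines above, kept inline so the snippet runs on its own):

```python
import json

sample = """{"n_kv_max": 2048, "pp": 128, "tg": 128, "pl": 1, "speed": 68.495094}
{"n_kv_max": 2048, "pp": 128, "tg": 128, "pl": 2, "speed": 44.410854}"""

for line in sample.splitlines():
    r = json.loads(line)  # one benchmark configuration per line
    print(f"pp={r['pp']} tg={r['tg']} pl={r['pl']} -> {r['speed']:.1f} t/s overall")
```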
@@ -122,12 +122,13 @@ int main(int argc, char ** argv) {
        }
    }

-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    if (!params.batched_bench_output_jsonl) {
+        LOG_TEE("\n");
+        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG_TEE("\n");
+
+        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }

    for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -195,7 +196,16 @@ int main(int argc, char ** argv) {
            const float speed_tg = pl*tg / t_tg;
            const float speed    = n_kv / t;

-            LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            if(params.batched_bench_output_jsonl) {
+                LOG_TEE(
+                    "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                    "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                    n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                    pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                );
+            } else {
+                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            }
        }
    }
}
@@ -27,6 +27,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
+    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
@@ -51,15 +51,12 @@ enum stop_type {
    STOP_TYPE_PARTIAL,
};

+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
    SLOT_STATE_IDLE,
-    SLOT_STATE_PROCESSING,
-};
-
-enum slot_command {
-    SLOT_COMMAND_NONE,
-    SLOT_COMMAND_LOAD_PROMPT,
-    SLOT_COMMAND_RELEASE,
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
};

enum server_state {
@@ -136,7 +133,6 @@ struct server_slot {
    struct slot_params params;

    slot_state state = SLOT_STATE_IDLE;
-    slot_command command = SLOT_COMMAND_NONE;

    // used to determine the slot that has been used the longest
    int64_t t_last_used = -1;
@@ -195,6 +191,8 @@ struct server_slot {
    double t_prompt_processing; // ms
    double t_token_generation; // ms

+    std::function<void(int)> callback_on_release;
+
    void reset() {
        n_prompt_tokens = 0;
        generated_text = "";
@@ -229,25 +227,28 @@ struct server_slot {
        return n_remaining > 0; // no budget
    }

-    bool available() const {
-        return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
-    }
-
    bool is_processing() const {
-        return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
+        return state != SLOT_STATE_IDLE;
    }

    void add_token_string(const completion_token_output & token) {
-        if (command == SLOT_COMMAND_RELEASE) {
+        if (!is_processing()) {
            return;
        }
        generated_token_probs.push_back(token);
    }

    void release() {
-        if (state == SLOT_STATE_PROCESSING) {
+        if (is_processing()) {
            t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
-            command = SLOT_COMMAND_RELEASE;
+            state = SLOT_STATE_IDLE;
+            LOG_INFO("slot released", {
+                {"id_slot", id},
+                {"id_task", id_task},
+                {"n_past", n_past},
+                {"truncated", truncated},
+            });
+            callback_on_release(id);
        }
    }
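Editorial note, not part of the patch: with `slot_command` removed, a slot's lifecycle is carried by `slot_state` alone. A rough Python model of that lifecycle as read from this diff and the state diagram linked in PR #9283 (the transition table is an interpretation, not copied from the source):

```python
from enum import Enum, auto

class SlotState(Enum):
    IDLE = auto()
    PROCESSING_PROMPT = auto()
    DONE_PROMPT = auto()
    GENERATING = auto()

# plausible transitions: a task moves the slot forward, release() drops it back to IDLE
ALLOWED = {
    SlotState.IDLE:              {SlotState.PROCESSING_PROMPT},
    SlotState.PROCESSING_PROMPT: {SlotState.DONE_PROMPT, SlotState.IDLE},
    SlotState.DONE_PROMPT:       {SlotState.GENERATING, SlotState.IDLE},
    SlotState.GENERATING:        {SlotState.IDLE},
}

def is_processing(state: SlotState) -> bool:
    # mirrors server_slot::is_processing() in the new code
    return state is not SlotState.IDLE
```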
@@ -354,6 +355,9 @@ struct server_metrics {
    uint64_t n_tokens_predicted = 0;
    uint64_t t_tokens_generation = 0;

+    uint64_t n_decode_total = 0;
+    uint64_t n_busy_slots_total = 0;
+
    void init() {
        t_start = ggml_time_us();
    }
@@ -372,6 +376,15 @@ struct server_metrics {
        t_tokens_generation_total += slot.t_token_generation;
    }

+    void on_decoded(const std::vector<server_slot> & slots) {
+        n_decode_total++;
+        for (const auto & slot : slots) {
+            if (slot.is_processing()) {
+                n_busy_slots_total++;
+            }
+        }
+    }
+
    void reset_bucket() {
        n_prompt_tokens_processed = 0;
        t_prompt_processing = 0;
@@ -413,6 +426,7 @@ struct server_queue {

    // multi-task version of post()
    int post(std::vector<server_task> & tasks, bool front = false) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : tasks) {
            if (task.id == -1) {
                task.id = id++;
@@ -432,6 +446,7 @@ struct server_queue {
    void defer(server_task task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        queue_tasks_deferred.push_back(std::move(task));
+        condition_tasks.notify_one();
    }

    // Get the next id for creating a new task
@@ -452,14 +467,14 @@ struct server_queue {
        callback_update_slots = std::move(callback);
    }

-    // Call when the state of one slot is changed
-    void notify_slot_changed() {
-        // move deferred tasks back to main loop
+    // Call when the state of one slot is changed, it will move one task from deferred to main queue
+    void pop_deferred_task() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
-        for (auto & task : queue_tasks_deferred) {
-            queue_tasks.push_back(std::move(task));
+        if (!queue_tasks_deferred.empty()) {
+            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
+            queue_tasks_deferred.pop_front();
        }
-        queue_tasks_deferred.clear();
+        condition_tasks.notify_one();
    }

    // end the start_loop routine
@@ -489,7 +504,7 @@ struct server_queue {
                break;
            }
            server_task task = queue_tasks.front();
-            queue_tasks.erase(queue_tasks.begin());
+            queue_tasks.pop_front();
            lock.unlock();
            LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
            callback_new_task(task);
@@ -717,6 +732,10 @@ struct server_context {

        slot.sparams = params.sparams;

+        slot.callback_on_release = [this](int) {
+            queue_tasks.pop_deferred_task();
+        };
+
        slot.reset();

        slots.push_back(slot);
@@ -798,7 +817,7 @@ struct server_context {

        for (server_slot & slot : slots) {
            // skip the slot if it is not available
-            if (!slot.available()) {
+            if (slot.is_processing()) {
                continue;
            }

@@ -840,7 +859,7 @@ struct server_context {
        int64_t t_last = ggml_time_us();
        for (server_slot & slot : slots) {
            // skip the slot if it is not available
-            if (!slot.available()) {
+            if (slot.is_processing()) {
                continue;
            }

@@ -1078,7 +1097,7 @@ struct server_context {
            }
        }

-        slot.command = SLOT_COMMAND_LOAD_PROMPT;
+        slot.state = SLOT_STATE_PROCESSING_PROMPT;
        slot.prompt_tokens.clear();

        LOG_INFO("slot is processing task", {
@@ -1622,7 +1641,7 @@ struct server_context {
                    queue_tasks.defer(task);
                    break;
                }
-                if (!slot->available()) {
+                if (slot->is_processing()) {
                    // if requested slot is unavailable, we defer this task for processing later
                    LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                    queue_tasks.defer(task);
@@ -1728,6 +1747,9 @@ struct server_context {
                    { "n_tokens_predicted", metrics.n_tokens_predicted},
                    { "t_tokens_generation", metrics.t_tokens_generation},

+                    { "n_decode_total", metrics.n_decode_total},
+                    { "n_busy_slots_total", metrics.n_busy_slots_total},
+
                    { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
                    { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},

@@ -1747,7 +1769,7 @@ struct server_context {
                    send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                    break;
                }
-                if (!slot->available()) {
+                if (slot->is_processing()) {
                    // if requested slot is unavailable, we defer this task for processing later
                    LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                    queue_tasks.defer(task);
@@ -1788,7 +1810,7 @@ struct server_context {
                    send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                    break;
                }
-                if (!slot->available()) {
+                if (slot->is_processing()) {
                    // if requested slot is unavailable, we defer this task for processing later
                    LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                    queue_tasks.defer(task);
@@ -1836,7 +1858,7 @@ struct server_context {
                    send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                    break;
                }
-                if (!slot->available()) {
+                if (slot->is_processing()) {
                    // if requested slot is unavailable, we defer this task for processing later
                    LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
                    queue_tasks.defer(task);
@@ -1876,33 +1898,12 @@ struct server_context {
            system_prompt_update();
        }

-        // release slots
-        for (auto & slot : slots) {
-            if (slot.command == SLOT_COMMAND_RELEASE) {
-                slot.state = SLOT_STATE_IDLE;
-                slot.command = SLOT_COMMAND_NONE;
-                slot.t_last_used = ggml_time_us();
-
-                LOG_INFO("slot released", {
-                    {"id_slot", slot.id},
-                    {"id_task", slot.id_task},
-                    {"n_ctx", n_ctx},
-                    {"n_past", slot.n_past},
-                    {"n_system_tokens", system_tokens.size()},
-                    {"n_cache_tokens", slot.cache_tokens.size()},
-                    {"truncated", slot.truncated}
-                });
-
-                queue_tasks.notify_slot_changed();
-            }
-        }
-
        // check if all slots are idle
        {
            bool all_idle = true;

            for (auto & slot : slots) {
-                if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
+                if (slot.is_processing()) {
                    all_idle = false;
                    break;
                }
@@ -1973,7 +1974,7 @@ struct server_context {

        // frist, add sampled tokens from any ongoing sequences
        for (auto & slot : slots) {
-            if (slot.state == SLOT_STATE_IDLE) {
+            if (slot.state != SLOT_STATE_GENERATING) {
                continue;
            }

@@ -2015,7 +2016,7 @@ struct server_context {
        if (params.cont_batching || batch.n_tokens == 0) {
            for (auto & slot : slots) {
                // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
+                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
                    auto & prompt_tokens = slot.prompt_tokens;

                    // we haven't tokenized the prompt yet - do it now:
@@ -2083,8 +2084,6 @@ struct server_context {
                            {"id_task", slot.id_task}
                        });

-                        slot.state = SLOT_STATE_PROCESSING;
-                        slot.command = SLOT_COMMAND_NONE;
                        slot.release();
                        slot.print_timings();
                        send_final_response(slot);
@@ -2094,8 +2093,6 @@ struct server_context {
                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
                        // this prompt is too large to process - discard it
                        if (slot.n_prompt_tokens > n_ubatch) {
-                            slot.state = SLOT_STATE_PROCESSING;
-                            slot.command = SLOT_COMMAND_NONE;
                            slot.release();
                            send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                            continue;
@@ -2253,10 +2250,9 @@ struct server_context {
                        {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
                    });

-                    // entire prompt has been processed - start decoding new tokens
+                    // entire prompt has been processed
                    if (slot.n_past == slot.n_prompt_tokens) {
-                        slot.state = SLOT_STATE_PROCESSING;
-                        slot.command = SLOT_COMMAND_NONE;
+                        slot.state = SLOT_STATE_DONE_PROMPT;

                        GGML_ASSERT(batch.n_tokens > 0);

@@ -2338,18 +2334,17 @@ struct server_context {
            };

            const int ret = llama_decode(ctx, batch_view);
+            metrics.on_decoded(slots);

            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
                        {"i", i},
-                        {"n_batch", ret},
+                        {"n_batch", n_batch},
                        {"ret", ret},
                    });
                    for (auto & slot : slots) {
-                        slot.state = SLOT_STATE_PROCESSING;
-                        slot.command = SLOT_COMMAND_NONE;
                        slot.release();
                        send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
                    }
@@ -2361,24 +2356,31 @@ struct server_context {
                i -= n_batch;

                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
                    {"i", i},
                    {"n_batch", n_batch},
                    {"ret", ret},
                });

                continue; // continue loop of n_batch
            }

            for (auto & slot : slots) {
-                if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+                if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
                    continue; // continue loop of slots
                }

-                // prompt evaluated for embedding
-                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
-                    send_embedding(slot, batch_view);
-                    slot.release();
-                    slot.i_batch = -1;
+                if (slot.state == SLOT_STATE_DONE_PROMPT) {
+                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
+                        // prompt evaluated for embedding
+                        send_embedding(slot, batch_view);
+                        slot.release();
+                        slot.i_batch = -1;
+                        continue; // continue loop of slots
+                    } else {
+                        // prompt evaluated for next-token prediction
+                        slot.state = SLOT_STATE_GENERATING;
+                    }
+                } else if (slot.state != SLOT_STATE_GENERATING) {
                    continue; // continue loop of slots
                }

@@ -2425,6 +2427,7 @@ struct server_context {
                }

                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
                    slot.release();
                    slot.print_timings();
                    send_final_response(slot);
@@ -2705,7 +2708,7 @@ int main(int argc, char ** argv) {
        task.type = SERVER_TASK_TYPE_METRICS;

        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+        ctx_server.queue_tasks.post(task, true); // high-priority task

        // get the result
        server_task_result result = ctx_server.queue_results.recv(task.id);
@@ -2737,7 +2740,7 @@ int main(int argc, char ** argv) {
        task.data.push_back({{"reset_bucket", true}});

        ctx_server.queue_results.add_waiting_task_id(task.id);
-        ctx_server.queue_tasks.post(task);
+        ctx_server.queue_tasks.post(task, true); // high-priority task

        // get the result
        server_task_result result = ctx_server.queue_results.recv(task.id);
@@ -2751,6 +2754,9 @@ int main(int argc, char ** argv) {
        const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
        const uint64_t t_tokens_generation = data.at("t_tokens_generation");

+        const uint64_t n_decode_total = data.at("n_decode_total");
+        const uint64_t n_busy_slots_total = data.at("n_busy_slots_total");
+
        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");

        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
@@ -2771,6 +2777,14 @@ int main(int argc, char ** argv) {
                    {"name", "tokens_predicted_seconds_total"},
                    {"help", "Predict process time"},
                    {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
+            }, {
+                    {"name", "n_decode_total"},
+                    {"help", "Total number of llama_decode() calls"},
+                    {"value", n_decode_total}
+            }, {
+                    {"name", "n_busy_slots_per_decode"},
+                    {"help", "Average number of busy slots per llama_decode() call"},
+                    {"value", (float) n_busy_slots_total / (float) n_decode_total}
            }}},
            {"gauge", {{
                    {"name", "prompt_tokens_seconds"},
@@ -2837,7 +2851,7 @@ int main(int argc, char ** argv) {
        task.data = {
            { "id_slot", id_slot },
            { "filename", filename },
-            { "filepath", filepath }
+            { "filepath", filepath },
        };

        const int id_task = ctx_server.queue_tasks.post(task);
@@ -2867,7 +2881,7 @@ int main(int argc, char ** argv) {
        task.data = {
            { "id_slot", id_slot },
            { "filename", filename },
-            { "filepath", filepath }
+            { "filepath", filepath },
        };

        const int id_task = ctx_server.queue_tasks.post(task);
@@ -2945,7 +2959,7 @@ int main(int argc, char ** argv) {
            { "system_prompt", ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params.n_parallel },
-            { "chat_template", curr_tmpl.c_str() }
+            { "chat_template", curr_tmpl.c_str() },
        };

        res_ok(res, data);
@@ -3056,13 +3070,13 @@ int main(int argc, char ** argv) {
        json models = {
            {"object", "list"},
            {"data", {
                {
                    {"id", params.model_alias},
                    {"object", "model"},
                    {"created", std::time(0)},
                    {"owned_by", "llamacpp"},
                    {"meta", ctx_server.model_meta()}
                },
            }}
        };

@@ -77,6 +77,35 @@ Feature: Parallel
      | disabled  | 128       |
      | enabled   | 64        |

+  Scenario Outline: Multi users with number of prompts exceeding number of slots
+    Given a system prompt You are a writer.
+    And   a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And a prompt:
+      """
+      What is LLM?
+      """
+    And a prompt:
+      """
+      The sky is blue and I love it.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
    Given a prompt:
@@ -15,6 +15,7 @@ Feature: Passkey / Self-extend with context shift
    And <n_junk> as number of junk
    And <n_predicted> server max tokens to predict
    And 42 as seed
+    And 0.0 temperature
    And <n_ctx> KV cache size
    And 1 slots
    And <n_ga> group attention factor to extend context size through self-extend
@@ -22,7 +23,8 @@ Feature: Passkey / Self-extend with context shift
    # Can be override with N_GPU_LAYERS
    And <ngl> GPU offloaded layers
    Then the server is starting
-    Then the server is healthy
+    # Higher timeout because the model may need to be downloaded from the internet
+    Then the server is healthy with timeout 120 seconds
    Given available models
    Then model 0 is trained on <n_ctx_train> tokens context
    Given a prefix prompt:
@@ -202,17 +202,15 @@ def step_start_server(context):
        time.sleep(0.1)


-@step("the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
+async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
    match expecting_status:
        case 'healthy':
            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=30)
+                                        timeout=timeout)

        case 'ready' | 'idle':
            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=30,
+                                        timeout=timeout,
                                        params={'fail_on_no_slot': 1},
                                        slots_idle=context.n_slots,
                                        slots_processing=0)
@@ -225,6 +223,18 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status: Lite
            assert False, "unknown status"


+@step("the server is {expecting_status} with timeout {timeout:d} seconds")
+@async_run_until_complete
+async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
+    await wait_for_server_status_with_timeout(context, expecting_status, timeout)
+
+
+@step("the server is {expecting_status}")
+@async_run_until_complete
+async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
+    await wait_for_server_status_with_timeout(context, expecting_status, 30)
+
+
@step('all slots are {expected_slot_status_string}')
@async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
@@ -401,6 +401,8 @@ extern "C" {
        GGML_TYPE_Q4_0_4_4 = 31,
        GGML_TYPE_Q4_0_4_8 = 32,
        GGML_TYPE_Q4_0_8_8 = 33,
+        GGML_TYPE_TQ1_0   = 34,
+        GGML_TYPE_TQ2_0   = 35,
        GGML_TYPE_COUNT,
    };

@@ -1165,6 +1165,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
        }
    }

+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+    }
+
    // graph input
    if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1644,7 +1649,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        sched->prev_leaf_backend_ids = tmp;
    }

-    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
    if (sched->graph.size < graph_size) {
        sched->graph.size = graph_size;
        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@@ -1696,6 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            for (int c = 0; c < sched->n_copies; c++) {
                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
            }
        }
@@ -1709,6 +1715,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            for (int c = 0; c < sched->n_copies; c++) {
                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
            }
        }
@@ -1719,6 +1726,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
    }
}
@@ -227,6 +227,25 @@ typedef struct {
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");

+//
+// Ternary quantization
+//
+
+// 1.6875 bpw
+typedef struct {
+    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
+    uint8_t qh[QK_K/64]; // 4 elements per byte
+    ggml_half d;
+} block_tq1_0;
+static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
+
+// 2.0625 bpw
+typedef struct {
+    uint8_t qs[QK_K/4]; // 2 bits per element
+    ggml_half d;
+} block_tq2_0;
+static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
+
//
// Super-block quantization structures
//
@@ -361,6 +380,7 @@ typedef struct {
} block_iq3_s;
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");

+// 1.5625 bpw
typedef struct {
    ggml_half d;
    uint8_t qs[QK_K/8];
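Editorial note, not part of the patch: `block_tq1_0` stores 5 ternary values per byte because 3^5 = 243 fits in 256. The byte holds the base-3 value rescaled by 256/243 with a ceiling division, which lets a kernel peel off one digit per multiply-by-3 and shift. A self-contained sketch of that packing in pure Python (assumption: this mirrors the reference quantizer that appears further down in this diff):

```python
from itertools import product

def pack5(trits):  # 5 values in {0, 1, 2}, most significant digit first
    v = 0
    for t in trits:
        v = v * 3 + t
    return (v * 256 + 242) // 243  # ceiling division; result fits in a uint8

def unpack5(q):
    out = []
    for _ in range(5):
        q *= 3
        out.append(q >> 8)  # next base-3 digit
        q &= 0xFF           # keep the fractional part for the next digit
    return out

# exhaustive round-trip check over all 3^5 = 243 combinations
assert all(unpack5(pack5(t)) == list(t) for t in product(range(3), repeat=5))
```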
@@ -2576,8 +2576,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
                // store a pointer to each copy op CUDA kernel to identify it later
                void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
-                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
-                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                if (!ptr) {
+                    use_cuda_graph = false;
+#ifndef NDEBUG
+                    GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+#endif
+                } else {
+                    if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                        ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                    }
                }
            }

@@ -2846,6 +2853,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
            if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
                return true;
            }
+            if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
+                return true;
+            }
            return false;
        } break;
        case GGML_OP_DUP:
@@ -428,7 +428,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
+        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
@@ -449,9 +452,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
    }
}

@@ -461,29 +463,30 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
}

void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        return nullptr;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
    }
}

@ -175,7 +175,7 @@ typedef __fp16 ggml_fp16_internal_t;
 
 // 32-bit ARM compatibility
 
-// vaddvq_s16
+// vaddlvq_s16
 // vpaddq_s16
 // vpaddq_s32
 // vaddvq_s32
@ -185,12 +185,9 @@ typedef __fp16 ggml_fp16_internal_t;
 // vzip1_u8
 // vzip2_u8
 
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
 }
 
 inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
@ -1631,7 +1631,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int6
 // ===================== Helper functions
 //
 static inline int nearest_int(float fval) {
-    assert(fval <= 4194303.f);
+    assert(fabsf(fval) <= 4194303.f);
     float val = fval + 12582912.f;
     int i; memcpy(&i, &val, sizeof(int));
     return (i & 0x007fffff) - 0x00400000;
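The constant 12582912.f is 1.5 * 2^23: adding it forces the rounded integer into the low 23 mantissa bits of the float, so a bit copy plus a mask recovers round(fval) without a library call, as long as |fval| <= 4194303 (2^22 - 1) — which is exactly what the corrected assert checks. An illustrative Python sketch of the same bit trick (not part of the patch):

import struct

def nearest_int(fval: float) -> int:
    # fval + 1.5*2**23 keeps the rounded integer (biased by 2**22) in the low 23 mantissa bits
    i = struct.unpack('<i', struct.pack('<f', fval + 12582912.0))[0]
    return (i & 0x007fffff) - 0x00400000

assert nearest_int(1.6) == 2 and nearest_int(-1.6) == -2
assert nearest_int(2.5) == 2 and nearest_int(-2.5) == -2   # round-half-to-even, like the FPU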
@ -3307,6 +3307,191 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
+// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
+
+void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int64_t i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK_K; j++) {
+            const float v = x[j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        // 5 elements per byte, along 32 bytes
+        for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) {
+            for (size_t m = 0; m < 32; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 5; ++n) {
+                    int xi = lroundf(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2
+                    q *= 3;
+                    q += xi;
+                }
+                // ceiling division (243 == pow(3, 5))
+                q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+                y[i].qs[j + m] = q;
+            }
+            x += 5*32;
+        }
+        // along 16 bytes
+        for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) {
+            for (size_t m = 0; m < 16; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 5; ++n) {
+                    int xi = lroundf(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2
+                    q *= 3;
+                    q += xi;
+                }
+                // ceiling division (243 == pow(3, 5))
+                q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+                y[i].qs[j + m] = q;
+            }
+            x += 5*16;
+        }
+        // 4 elements per byte
+        for (size_t j = 0; j < sizeof(y->qh); ++j) {
+            uint8_t q = 0;
+            for (size_t m = 0; m < 4; ++m) {
+                // -1, 0, 1 -> 0, 1, 2
+                int xi = lroundf(x[j + m*sizeof(y->qh)] * id) + 1;
+                q *= 3;
+                q += xi;
+            }
+            // shift the first value to the most significant trit
+            q *= 3;
+            // ceiling division (243 == pow(3, 5))
+            q = ((uint16_t)q * 256 + (243 - 1)) / 243;
+            y[i].qh[j] = q;
+        }
+        x += 4*sizeof(y->qh);
+    }
+}
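The packing above is dense base-3 with a fixed-point twist: five trits are accumulated into a value in [0, 242], then ceiling-scaled by 256/243 so that each trit can later be peeled off with a byte multiply and a shift, with no division at decode time. A standalone Python sketch of that round trip (the helper names pack5/unpack5 are illustrative, not from the patch):

from itertools import product

def pack5(trits):                      # five values in {0, 1, 2}, most significant first
    v = 0
    for t in trits:
        v = v * 3 + t                  # base-3 value in [0, 242]
    return (v * 256 + 243 - 1) // 243  # ceiling-scale into [0, 255]

def unpack5(q):
    out = []
    for _ in range(5):
        out.append((q * 3) >> 8)       # floor(q*3/256) is the most significant remaining trit
        q = (q * 3) & 0xFF             # same as multiplying the original byte by the next power of 3 (mod 256)
    return out

assert all(unpack5(pack5(list(t))) == list(t) for t in product(range(3), repeat=5))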
+
+void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int64_t i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK_K; j++) {
+            const float v = x[j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (size_t j = 0; j < sizeof(y->qs); j += 32) {
+            for (size_t m = 0; m < 32; ++m) {
+                uint8_t q = 0;
+                for (size_t n = 0; n < 4; ++n) {
+                    // -1, 0, 1 -> 0, 1, 2
+                    int xi = lroundf(x[m + n*32] * id) + 1;
+                    q += (xi & 3) << (2*n);
+                }
+                y[i].qs[j + m] = q;
+            }
+            x += 4*32;
+        }
+    }
+}
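TQ2_0 stores the same ternary values in plain 2-bit fields, four per byte, trading some density (2 bits instead of the log2(3) ≈ 1.58-bit ideal) for unpacking that is just a shift and a mask. A small Python sketch of one byte (illustrative names, not from the patch):

def pack4(trits):                      # four values in {0, 1, 2}, lowest bit pair first
    q = 0
    for n, t in enumerate(trits):
        q |= (t & 3) << (2 * n)
    return q

def unpack4(q):
    return [(q >> (2 * n)) & 3 for n in range(4)]

assert unpack4(pack4([2, 0, 1, 2])) == [2, 0, 1, 2]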
+
+void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq1_0 * restrict y = vy;
+    quantize_row_tq1_0_ref(x, y, k);
+}
+
+void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(k % QK_K == 0);
+    block_tq2_0 * restrict y = vy;
+    quantize_row_tq2_0_ref(x, y, k);
+}
+
+size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void)quant_weights; // not used
+    const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
+    quantize_row_tq1_0(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * row_size;
+}
+
+size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    (void)quant_weights; // not used
+    const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
+    quantize_row_tq2_0(src, dst, (int64_t)nrow*n_per_row);
+    return nrow * row_size;
+}
+
+
+void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
+
+    for (int64_t i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
+            for (size_t n = 0; n < 5; ++n) {
+                for (size_t m = 0; m < 32; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[n];
+                    int16_t xi = ((uint16_t) q * 3) >> 8;
+                    *y++ = (float) (xi - 1) * d;
+                }
+            }
+        }
+        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
+            for (size_t n = 0; n < 5; ++n) {
+                for (size_t m = 0; m < 16; ++m) {
+                    uint8_t q = x[i].qs[j + m] * pow3[n];
+                    int16_t xi = ((uint16_t) q * 3) >> 8;
+                    *y++ = (float) (xi - 1) * d;
+                }
+            }
+        }
+
+        for (size_t n = 0; n < 4; ++n) {
+            for (size_t j = 0; j < sizeof(x->qh); ++j) {
+                uint8_t q = x[i].qh[j] * pow3[n];
+                int16_t xi = ((uint16_t) q * 3) >> 8;
+                *y++ = (float) (xi - 1) * d;
+            }
+        }
+    }
+}
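A worked check of that decode path (illustrative, not from the patch): the quantizer maps the trit sequence 1,2,2,2,2 (base-3 value 161) to the byte ceil(161*256/243) = 170, and the multiply-by-pow3 walk above recovers the same trits before they are recentred to {-1, 0, +1} and scaled by d.

pow3 = [1, 3, 9, 27, 81]
q0 = (161 * 256 + 243 - 1) // 243            # == 170, the stored byte
trits = [(((q0 * p) & 0xFF) * 3) >> 8 for p in pow3]
assert trits == [1, 2, 2, 2, 2]
weights = [t - 1 for t in trits]             # {-1, 0, +1}; multiplied by the block scale d at runtime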
+
+void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+
+    for (int64_t i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            for (size_t l = 0; l < 4; ++l) {
+                for (size_t m = 0; m < 32; ++m) {
+                    int8_t q = (x[i].qs[j + m] >> (l*2)) & 3;
+                    *y++ = (float) (q - 1) * d;
+                }
+            }
+        }
+    }
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
@ -5471,6 +5656,501 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
|
const block_tq1_0 * restrict x = vx;
|
||||||
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
|
#if defined(__ARM_NEON)
|
||||||
|
float sumf = 0.0f;
|
||||||
|
|
||||||
|
uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
|
||||||
|
|
||||||
|
const uint8x16_t shift = vld1q_u8(k_shift);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
int32x4_t sumi0 = vdupq_n_s32(0);
|
||||||
|
int32x4_t sumi1 = vdupq_n_s32(0);
|
||||||
|
#else
|
||||||
|
int16x8_t sumi0 = vdupq_n_s16(0);
|
||||||
|
int16x8_t sumi1 = vdupq_n_s16(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// first 32 bytes of 5 elements
|
||||||
|
{
|
||||||
|
uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
|
||||||
|
uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
|
||||||
|
uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
|
||||||
|
uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
|
||||||
|
uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
|
||||||
|
uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9));
|
||||||
|
uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27));
|
||||||
|
uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27));
|
||||||
|
uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81));
|
||||||
|
uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81));
|
||||||
|
|
||||||
|
// multiply by 3 and keep the 2 bits above 8 bits
|
||||||
|
int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
|
||||||
|
int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
|
||||||
|
int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
|
||||||
|
int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
|
||||||
|
int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
|
||||||
|
int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
|
||||||
|
int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6));
|
||||||
|
int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6));
|
||||||
|
int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6));
|
||||||
|
int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6));
|
||||||
|
|
||||||
|
const int8x16_t qy0 = vld1q_s8(y[i].qs + 0);
|
||||||
|
const int8x16_t qy1 = vld1q_s8(y[i].qs + 16);
|
||||||
|
const int8x16_t qy2 = vld1q_s8(y[i].qs + 32);
|
||||||
|
const int8x16_t qy3 = vld1q_s8(y[i].qs + 48);
|
||||||
|
const int8x16_t qy4 = vld1q_s8(y[i].qs + 64);
|
||||||
|
const int8x16_t qy5 = vld1q_s8(y[i].qs + 80);
|
||||||
|
const int8x16_t qy6 = vld1q_s8(y[i].qs + 96);
|
||||||
|
const int8x16_t qy7 = vld1q_s8(y[i].qs + 112);
|
||||||
|
const int8x16_t qy8 = vld1q_s8(y[i].qs + 128);
|
||||||
|
const int8x16_t qy9 = vld1q_s8(y[i].qs + 144);
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx0, qy0);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx1, qy1);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx2, qy2);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx3, qy3);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx4, qy4);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx5, qy5);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx6, qy6);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx7, qy7);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx8, qy8);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx9, qy9);
|
||||||
|
#else
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// last 16 bytes of 5-element, along with the 4 bytes of 4 elements
|
||||||
|
{
|
||||||
|
uint8x16_t qx0 = vld1q_u8(x[i].qs + 32);
|
||||||
|
uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3));
|
||||||
|
uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9));
|
||||||
|
uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27));
|
||||||
|
uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81));
|
||||||
|
uint32_t qh;
|
||||||
|
memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
|
||||||
|
uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh));
|
||||||
|
qx5 = vmulq_u8(qx5, shift);
|
||||||
|
|
||||||
|
// multiply by 3 and keep the 2 bits above 8 bits
|
||||||
|
int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
|
||||||
|
int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
|
||||||
|
int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
|
||||||
|
int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
|
||||||
|
int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
|
||||||
|
int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
|
||||||
|
|
||||||
|
const int8x16_t qy0 = vld1q_s8(y[i].qs + 160);
|
||||||
|
const int8x16_t qy1 = vld1q_s8(y[i].qs + 176);
|
||||||
|
const int8x16_t qy2 = vld1q_s8(y[i].qs + 192);
|
||||||
|
const int8x16_t qy3 = vld1q_s8(y[i].qs + 208);
|
||||||
|
const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
|
||||||
|
const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx0, qy0);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx1, qy1);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx2, qy2);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx3, qy3);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx4, qy4);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx5, qy5);
|
||||||
|
#else
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
||||||
|
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
||||||
|
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
sumi0 = vaddq_s32(sumi0, sumi1);
|
||||||
|
sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
|
||||||
|
|
||||||
|
sumf += d * (float) vaddvq_s32(sumi0);
|
||||||
|
#else
|
||||||
|
sumi0 = vaddq_s16(sumi0, sumi1);
|
||||||
|
sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
|
||||||
|
|
||||||
|
sumf += d * (float) vaddlvq_s16(sumi0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
__m256 sumf = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
// 16-bit sums
|
||||||
|
__m256i sumi0 = _mm256_setzero_si256();
|
||||||
|
__m256i sumi1 = _mm256_setzero_si256();
|
||||||
|
__m256i sumi2 = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
// first 32 bytes of 5 elements
|
||||||
|
{
|
||||||
|
__m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
|
||||||
|
// 8-bit multiplies with shifts, masks and adds
|
||||||
|
__m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
|
||||||
|
__m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
|
||||||
|
__m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
|
||||||
|
__m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
|
||||||
|
|
||||||
|
// TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
|
||||||
|
|
||||||
|
// Cancel the +1 from avg so that it behaves like a halving add
|
||||||
|
qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
|
||||||
|
qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
|
||||||
|
qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
|
||||||
|
qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
|
||||||
|
qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
|
||||||
|
// Multiply by 3 and get the top 2 bits
|
||||||
|
qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
|
||||||
|
qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
|
||||||
|
qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
|
||||||
|
qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
|
||||||
|
qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
|
||||||
|
qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
|
||||||
|
qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
|
||||||
|
qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
|
||||||
|
qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
|
||||||
|
qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
|
||||||
|
|
||||||
|
const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0));
|
||||||
|
const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32));
|
||||||
|
const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64));
|
||||||
|
const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96));
|
||||||
|
const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
|
||||||
|
|
||||||
|
qx0 = _mm256_maddubs_epi16(qx0, qy0);
|
||||||
|
qx1 = _mm256_maddubs_epi16(qx1, qy1);
|
||||||
|
qx2 = _mm256_maddubs_epi16(qx2, qy2);
|
||||||
|
qx3 = _mm256_maddubs_epi16(qx3, qy3);
|
||||||
|
qx4 = _mm256_maddubs_epi16(qx4, qy4);
|
||||||
|
|
||||||
|
sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
|
||||||
|
sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
|
||||||
|
sumi2 = _mm256_add_epi16(sumi2, qx4);
|
||||||
|
}
|
||||||
|
|
||||||
|
// last 16 bytes of 5-element, along with the 4 bytes of 4 elements
|
||||||
|
{
|
||||||
|
__m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
|
||||||
|
uint32_t qh;
|
||||||
|
memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
|
||||||
|
__m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
|
||||||
|
__m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
|
||||||
|
__m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
|
||||||
|
__m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
|
||||||
|
__m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
|
||||||
|
__m256i qx01 = MM256_SET_M128I(qx1, qx0);
|
||||||
|
__m256i qx23 = MM256_SET_M128I(qx3, qx2);
|
||||||
|
|
||||||
|
// avx2 does not have 8-bit multiplies, so 16-bit it is.
|
||||||
|
qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
|
||||||
|
qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
|
||||||
|
__m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
|
||||||
|
|
||||||
|
__m256i qx45 = MM256_SET_M128I(qx5, qx4);
|
||||||
|
|
||||||
|
// Cancel the +1 from avg so that it behaves like a halving add
|
||||||
|
qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
|
||||||
|
qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
|
||||||
|
qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
|
||||||
|
// Multiply by 3 and get the top 2 bits
|
||||||
|
qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
|
||||||
|
qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
|
||||||
|
qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
|
||||||
|
qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
|
||||||
|
qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
|
||||||
|
qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
|
||||||
|
|
||||||
|
const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
|
||||||
|
const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
|
||||||
|
const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
|
||||||
|
|
||||||
|
qx01 = _mm256_maddubs_epi16(qx01, qy01);
|
||||||
|
qx23 = _mm256_maddubs_epi16(qx23, qy23);
|
||||||
|
qx45 = _mm256_maddubs_epi16(qx45, qy45);
|
||||||
|
|
||||||
|
sumi0 = _mm256_add_epi16(sumi0, qx01);
|
||||||
|
sumi1 = _mm256_add_epi16(sumi1, qx23);
|
||||||
|
sumi2 = _mm256_add_epi16(sumi2, qx45);
|
||||||
|
}
|
||||||
|
|
||||||
|
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
|
||||||
|
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
|
||||||
|
|
||||||
|
sumi0 = _mm256_sub_epi16(sumi0, ysum);
|
||||||
|
sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
|
||||||
|
sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
|
||||||
|
|
||||||
|
sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(sumf);
|
||||||
|
|
||||||
|
#else
|
||||||
|
const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
|
||||||
|
|
||||||
|
float sumf = 0.0f;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
int sum = 0;
|
||||||
|
|
||||||
|
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
||||||
|
for (size_t l = 0; l < 5; ++l) {
|
||||||
|
for (size_t m = 0; m < 32; ++m) {
|
||||||
|
uint8_t q = x[i].qs[j + m] * pow3[l];
|
||||||
|
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||||
|
sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
||||||
|
for (size_t l = 0; l < 5; ++l) {
|
||||||
|
for (size_t m = 0; m < 16; ++m) {
|
||||||
|
uint8_t q = x[i].qs[j + m] * pow3[l];
|
||||||
|
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||||
|
sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t l = 0; l < 4; ++l) {
|
||||||
|
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
||||||
|
uint8_t q = x[i].qh[j] * pow3[l];
|
||||||
|
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
||||||
|
sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
|
const block_tq2_0 * restrict x = vx;
|
||||||
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
||||||
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
|
#if defined(__ARM_NEON)
|
||||||
|
float sumf = 0.0f;
|
||||||
|
|
||||||
|
const uint8x16_t m3 = vdupq_n_u8(3);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
int32x4_t sumi0 = vdupq_n_s32(0);
|
||||||
|
int32x4_t sumi1 = vdupq_n_s32(0);
|
||||||
|
#else
|
||||||
|
int16x8_t sumi0 = vdupq_n_s16(0);
|
||||||
|
int16x8_t sumi1 = vdupq_n_s16(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
||||||
|
uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
|
||||||
|
uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
|
||||||
|
uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
|
||||||
|
uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
|
||||||
|
uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
|
||||||
|
uint8x16_t qx5 = vshrq_n_u8(qx1, 4);
|
||||||
|
uint8x16_t qx6 = vshrq_n_u8(qx0, 6);
|
||||||
|
uint8x16_t qx7 = vshrq_n_u8(qx1, 6);
|
||||||
|
|
||||||
|
int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3));
|
||||||
|
int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3));
|
||||||
|
int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3));
|
||||||
|
int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3));
|
||||||
|
int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3));
|
||||||
|
int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3));
|
||||||
|
int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3));
|
||||||
|
int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3));
|
||||||
|
|
||||||
|
const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0);
|
||||||
|
const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16);
|
||||||
|
const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32);
|
||||||
|
const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48);
|
||||||
|
const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64);
|
||||||
|
const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80);
|
||||||
|
const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96);
|
||||||
|
const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx0, qy0);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx1, qy1);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx2, qy2);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx3, qy3);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx4, qy4);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx5, qy5);
|
||||||
|
sumi0 = vdotq_s32(sumi0, sqx6, qy6);
|
||||||
|
sumi1 = vdotq_s32(sumi1, sqx7, qy7);
|
||||||
|
#else
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
|
||||||
|
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
|
||||||
|
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
||||||
|
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
||||||
|
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_DOTPROD)
|
||||||
|
sumi0 = vaddq_s32(sumi0, sumi1);
|
||||||
|
sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));
|
||||||
|
|
||||||
|
sumf += d * (float) vaddvq_s32(sumi0);
|
||||||
|
#else
|
||||||
|
sumi0 = vaddq_s16(sumi0, sumi1);
|
||||||
|
sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));
|
||||||
|
|
||||||
|
sumf += d * (float) vaddlvq_s16(sumi0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
__m256 sumf = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
// 16-bit sums, because 256*127 still fits
|
||||||
|
__m256i sumi0 = _mm256_setzero_si256();
|
||||||
|
__m256i sumi1 = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
||||||
|
__m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
|
||||||
|
__m256i qx1 = _mm256_srli_epi16(qx0, 2);
|
||||||
|
__m256i qx2 = _mm256_srli_epi16(qx0, 4);
|
||||||
|
__m256i qx3 = _mm256_srli_epi16(qx0, 6);
|
||||||
|
|
||||||
|
// 0, 1, 2 (should not be 3)
|
||||||
|
qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
|
||||||
|
qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
|
||||||
|
qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
|
||||||
|
qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
|
||||||
|
|
||||||
|
const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0));
|
||||||
|
const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
|
||||||
|
const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
|
||||||
|
const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
|
||||||
|
|
||||||
|
qx0 = _mm256_maddubs_epi16(qx0, qy0);
|
||||||
|
qx1 = _mm256_maddubs_epi16(qx1, qy1);
|
||||||
|
qx2 = _mm256_maddubs_epi16(qx2, qy2);
|
||||||
|
qx3 = _mm256_maddubs_epi16(qx3, qy3);
|
||||||
|
|
||||||
|
sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
|
||||||
|
sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
|
||||||
|
}
|
||||||
|
|
||||||
|
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
|
||||||
|
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
|
||||||
|
|
||||||
|
sumi0 = _mm256_add_epi16(sumi0, sumi1);
|
||||||
|
sumi0 = _mm256_sub_epi16(sumi0, ysum);
|
||||||
|
sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
|
||||||
|
|
||||||
|
sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(sumf);
|
||||||
|
|
||||||
|
#else
|
||||||
|
float sumf = 0.0f;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
int32_t sumi = 0;
|
||||||
|
|
||||||
|
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
||||||
|
for (size_t l = 0; l < 4; ++l) {
|
||||||
|
for (size_t k = 0; k < 32; ++k) {
|
||||||
|
sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
|
||||||
|
sumf += (float) sumi * d;
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
UNUSED(nrc);
|
UNUSED(nrc);
|
||||||
|
@ -14801,6 +15481,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
{
|
||||||
|
VALIDATE_ROW_DATA_D_F16_IMPL(block_tq1_0, data, nb);
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
|
{
|
||||||
|
VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
|
||||||
|
} break;
|
||||||
case GGML_TYPE_IQ1_S:
|
case GGML_TYPE_IQ1_S:
|
||||||
{
|
{
|
||||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
|
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
|
||||||
|
|
|
@ -26,6 +26,9 @@ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST
|
||||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||||
|
@ -46,6 +49,9 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
@ -67,6 +73,9 @@ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRI
|
||||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
@ -90,6 +99,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
|
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
@ -111,6 +123,9 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
|
||||||
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
|
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -2480,7 +2480,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
||||||
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
|
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
|
||||||
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
|
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
|
||||||
for (auto& buffer : descriptor_buffer_infos) {
|
for (auto& buffer : descriptor_buffer_infos) {
|
||||||
std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
|
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
|
||||||
}
|
}
|
||||||
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
|
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
|
||||||
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
|
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
|
||||||
|
|
|
@ -1062,7 +1062,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.ncols = 8,
|
.ncols = 8,
|
||||||
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
||||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||||
}
|
},
|
||||||
|
[GGML_TYPE_TQ1_0] = {
|
||||||
|
.type_name = "tq1_0",
|
||||||
|
.blck_size = QK_K,
|
||||||
|
.type_size = sizeof(block_tq1_0),
|
||||||
|
.is_quantized = true,
|
||||||
|
.to_float = (ggml_to_float_t) dequantize_row_tq1_0,
|
||||||
|
.from_float = quantize_row_tq1_0,
|
||||||
|
.from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref,
|
||||||
|
.vec_dot = ggml_vec_dot_tq1_0_q8_K,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
|
},
|
||||||
|
[GGML_TYPE_TQ2_0] = {
|
||||||
|
.type_name = "tq2_0",
|
||||||
|
.blck_size = QK_K,
|
||||||
|
.type_size = sizeof(block_tq2_0),
|
||||||
|
.is_quantized = true,
|
||||||
|
.to_float = (ggml_to_float_t) dequantize_row_tq2_0,
|
||||||
|
.from_float = quantize_row_tq2_0,
|
||||||
|
.from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
|
||||||
|
.vec_dot = ggml_vec_dot_tq2_0_q8_K,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
// For internal test use
|
// For internal test use
|
||||||
|
@ -9932,6 +9956,8 @@ static void ggml_compute_forward_add(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -10310,6 +10336,8 @@ static void ggml_compute_forward_add1(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -10438,6 +10466,8 @@ static void ggml_compute_forward_acc(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -13440,6 +13470,8 @@ static void ggml_compute_forward_out_prod(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -13628,6 +13660,8 @@ static void ggml_compute_forward_set(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -13890,6 +13924,8 @@ static void ggml_compute_forward_get_rows(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -14479,6 +14515,8 @@ static void ggml_compute_forward_clamp(
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
case GGML_TYPE_TQ1_0:
|
||||||
|
case GGML_TYPE_TQ2_0:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -19572,7 +19610,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // posix?
|
#elif defined(__gnu_linux__)
|
||||||
|
// TODO: this may not work on BSD, to be verified
|
||||||
|
|
||||||
static bool ggml_thread_apply_affinity(const bool * mask) {
|
static bool ggml_thread_apply_affinity(const bool * mask) {
|
||||||
cpu_set_t cpuset;
|
cpu_set_t cpuset;
|
||||||
|
@ -19627,6 +19666,18 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else // unsupported platforms
|
||||||
|
|
||||||
|
static bool ggml_thread_apply_affinity(const bool * mask) {
|
||||||
|
UNUSED(mask);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_thread_apply_priority(int32_t prio) {
|
||||||
|
UNUSED(prio);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static bool ggml_thread_cpumask_is_valid(const bool * mask) {
|
static bool ggml_thread_cpumask_is_valid(const bool * mask) {
|
||||||
|
@ -21927,6 +21978,8 @@ size_t ggml_quantize_chunk(
|
||||||
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
|
case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
|
case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
|
|
|
@ -201,6 +201,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
|
||||||
#else
|
#else
|
||||||
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
|
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_VULKAN_SHADER_DEBUG_INFO
|
||||||
|
cmd.push_back("-g");
|
||||||
|
#endif
|
||||||
|
|
||||||
for (const auto& define : defines) {
|
for (const auto& define : defines) {
|
||||||
cmd.push_back("-D" + define.first + "=" + define.second);
|
cmd.push_back("-D" + define.first + "=" + define.second);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1291,6 +1291,8 @@ class GGMLQuantizationType(IntEnum):
     Q4_0_4_4 = 31
     Q4_0_4_8 = 32
     Q4_0_8_8 = 33
+    TQ1_0    = 34
+    TQ2_0    = 35
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
@ -1335,6 +1337,8 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
     MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
     MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
+    MOSTLY_TQ1_0    = 36 # except 1d tensors
+    MOSTLY_TQ2_0    = 37 # except 1d tensors
 
     GUESSED = 1024 # not specified in the model file
 
@ -1411,6 +1415,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
     GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
     GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
+    GGMLQuantizationType.TQ1_0:   (256, 2 + 4 * 13),
+    GGMLQuantizationType.TQ2_0:   (256, 2 + 64),
 }
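Those two size entries pin down the storage cost: a TQ1_0 block packs 256 weights into 2 + 4*13 = 54 bytes (an fp16 scale plus the packed trits), while TQ2_0 uses 2 + 64 = 66 bytes (fp16 scale plus plain 2-bit fields). A quick check of that arithmetic:

QK_K = 256
tq1_0_bytes = 2 + 4 * 13       # 54 bytes per 256 weights
tq2_0_bytes = 2 + 64           # 66 bytes per 256 weights
print(8 * tq1_0_bytes / QK_K)  # 1.6875 bits per weight, near the log2(3) ~ 1.58-bit ternary ideal
print(8 * tq2_0_bytes / QK_K)  # 2.0625 bits per weight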
|
@@ -574,6 +574,87 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
         return (d * q).reshape((n_blocks, QK_K))
 
 
+class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d = abs(blocks).max(axis=-1, keepdims=True)
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
+
+        qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
+        qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
+        qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
+        qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
+        qs = np.concatenate([qs0, qs1, qh], axis=-1)
+        qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243
+
+        qs = qs.astype(np.uint8)
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([qs, d], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
+        qh, d = np.hsplit(rest, [QK_K // 64])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs0, qs1 = qs[..., :32], qs[..., 32:]
+        qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs0 = qs0.reshape((n_blocks, -1))
+        qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs1 = qs1.reshape((n_blocks, -1))
+        qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = qh.reshape((n_blocks, -1))
+        qs = np.concatenate([qs0, qs1, qh], axis=-1)
+        qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)
+
+        return (d * qs.astype(np.float32))
+
+
+class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d = abs(blocks).max(axis=-1, keepdims=True)
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
+
+        qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
+        qs = qs.reshape((n_blocks, -1))
+
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([qs, d], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, d = np.hsplit(blocks, [QK_K // 4])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)
+
+        return (d * qs.astype(np.float32))
+
+
 class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
     ksigns: bytes = (
         b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
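For intuition, TQ2_0 is the simpler of the two layouts: each weight is stored as a 2-bit value (the ternary value plus one), four weights per byte, followed by one float16 scale per 256-element block; TQ1_0 instead packs five ternary digits per byte in base 3 (hence the fractional 1.69 bpw), which is why its packing code above is more involved. The following standalone sketch (not part of the diff; helper names are mine) mirrors the TQ2_0 round trip for a single block:

```python
import numpy as np

QK_K = 256  # block size shared by TQ1_0 and TQ2_0 in this diff

def tq2_0_pack(block: np.ndarray) -> tuple[np.ndarray, np.float16]:
    # Pack one block of QK_K ternary weights into 64 data bytes plus one f16 scale.
    # Assumes the block is not all zeros (the real class guards the division).
    d = np.abs(block).max()
    q = (np.round(block / d).astype(np.int8) + 1).astype(np.uint8)   # {-1,0,+1} -> {0,1,2}
    q = q.reshape(-1, 4, 32)                                         # two groups of 4 x 32
    packed = q[:, 0, :] | (q[:, 1, :] << 2) | (q[:, 2, :] << 4) | (q[:, 3, :] << 6)
    return packed.reshape(-1), np.float16(d)

def tq2_0_unpack(packed: np.ndarray, scale: np.float16) -> np.ndarray:
    shifts = np.array([0, 2, 4, 6], dtype=np.uint8).reshape(1, 4, 1)
    q = ((packed.reshape(-1, 1, 32) >> shifts) & 0x03).astype(np.int8) - np.int8(1)
    return np.float32(scale) * q.reshape(-1).astype(np.float32)

rng = np.random.default_rng(0)
w = rng.choice([-1.0, 0.0, 1.0], size=QK_K).astype(np.float32) * 0.025
packed, scale = tq2_0_pack(w)
assert packed.nbytes == 64                      # matches the (256, 2 + 64) entry above
assert np.allclose(tq2_0_unpack(packed, scale), w, atol=1e-3)
```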
@@ -66,6 +66,7 @@ class GGMLQuants:
         for t in (
             "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
             "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
+            "tq1_0", "tq2_0",
             "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
             "iq4_nl", "iq4_xs",
         ):
@@ -167,6 +167,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ1_0    = 36, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ2_0    = 37, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -41,7 +41,7 @@ maxhordelen = 400
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.74"
+KcppVersion = "1.75"
 showdebug = True
 guimode = False
 showsamplerwarning = True
@@ -4462,6 +4462,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
+            case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
             case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
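For context (my reading of the surrounding loader code, which is not shown in this hunk): the loader derives the reported "mostly" file type from whichever tensor type dominates the model, so a model whose weights are mostly TQ2_0 now identifies itself as such. A rough, hypothetical sketch of that idea:

```python
from collections import Counter

# Hypothetical helper: derive a "mostly X" file type from a model's tensor types.
def guess_mostly_ftype(tensor_types: list[str]) -> str:
    dominant, _ = Counter(tensor_types).most_common(1)[0]
    return f"LLAMA_FTYPE_MOSTLY_{dominant}"

assert guess_mostly_ftype(["TQ2_0"] * 200 + ["F32"] * 40) == "LLAMA_FTYPE_MOSTLY_TQ2_0"
```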
@@ -5168,6 +5170,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -8176,23 +8180,23 @@ static bool llm_load_tensors(
             layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
 
             layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-            layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+            layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
             layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-            layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+            layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
             layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-            layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+            layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
             layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-            layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+            layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
             layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
             layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
 
             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-            layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+            layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-            layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+            layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
             layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-            layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+            layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
         }
     } break;
     case LLM_ARCH_T5:
@@ -14253,7 +14257,9 @@ struct llm_build_context {
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-            Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+            if (model.layers[il].wq_scale) {
+                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+            }
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -14262,7 +14268,9 @@ struct llm_build_context {
 
             // B1.K
             struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-            Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+            if (model.layers[il].wk_scale) {
+                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+            }
             cb(Kcur, "Kcur", il);
             if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -14271,7 +14279,9 @@ struct llm_build_context {
 
             // B1.V
             struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-            Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+            if (model.layers[il].wv_scale) {
+                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+            }
             cb(Vcur, "Vcur", il);
             if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -14302,7 +14312,9 @@ struct llm_build_context {
             cb(cur, "attn_sub_norm", il);
 
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
-            cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+            if (model.layers[il].wo_scale) {
+                cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+            }
             if (model.layers[il].bo) {
                 cur = ggml_add(ctx0, cur, model.layers[il].bo);
             }
@@ -14339,7 +14351,9 @@ struct llm_build_context {
             cb(cur, "ffn_sub_norm", il);
 
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
-            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            if (model.layers[il].ffn_down_scale) {
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+            }
             cb(cur, "ffn_down", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
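Taken together with the loader change above, the pattern in these hunks is: the per-projection scale tensors of this BitNet-style build path are now optional, and the extra multiply is only added to the graph when the tensor was actually loaded, presumably because TQ1_0/TQ2_0 conversions carry the scale inside the quantized blocks and ship no separate scale tensors. A minimal sketch of that pattern, assuming nothing beyond what the hunks show (the helper name is mine):

```python
# Hypothetical mirror of the guarded multiply added above:
# apply a projection's scale only when the corresponding tensor exists.
def apply_optional_scale(x: float, scale: float | None) -> float:
    return x if scale is None else x * scale

assert apply_optional_scale(2.0, None) == 2.0   # ternary conversion: no separate scale tensor
assert apply_optional_scale(2.0, 0.5) == 1.0    # older conversion that still ships the scale tensor
```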
@@ -17009,6 +17023,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type == GGML_TYPE_Q4_0_8_8) {
             new_type = GGML_TYPE_Q4_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
+            new_type = GGML_TYPE_Q4_K;
+        }
     }
 } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -17208,6 +17225,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case GGML_TYPE_TQ1_0:
+            case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ2_S:
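The `convert_incompatible_tensor` path is the fallback for tensors whose shape cannot be represented in the requested block format, typically because the row size is not a multiple of the quant's block size; for the new ternary types the fallback is plain Q4_0. A small illustration of that divisibility constraint, using the 256-element block size from the GGML_QUANT_SIZES entries above (helper name is mine):

```python
# TQ1_0/TQ2_0 quantize whole 256-element blocks along each row,
# so the row size must be a multiple of 256 to use them directly.
def fits_ternary_block(n_per_row: int, block_elems: int = 256) -> bool:
    return n_per_row % block_elems == 0

assert fits_ternary_block(4096)         # typical hidden size: fine
assert not fits_ternary_block(1600)     # fits Q4_0's 32-element blocks, but not a 256-element block
```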
@@ -17313,6 +17332,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;