Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-10 17:14:36 +00:00

Merge commit '947538acb8' into concedo_experimental

# Conflicts:
#   .github/workflows/build.yml
#   .github/workflows/docker.yml
#   CMakePresets.json
#   examples/llama-bench/llama-bench.cpp
#   ggml/CMakeLists.txt
#   ggml/src/CMakeLists.txt
#   tests/test-backend-ops.cpp
#   tests/test-quantize-fns.cpp

Commit 70cdb55cc9

29 changed files with 93418 additions and 92397 deletions
@@ -1679,6 +1679,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        else { invalid_param = true; }
        return true;
    }
    if (arg == "--output-format") {
        CHECK_ARG
        std::string value(argv[i]);
        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
        else if (value == "md") { params.batched_bench_output_jsonl = false; }
        else { invalid_param = true; }
        return true;
    }
    if (arg == "--no-warmup") {
        params.warmup = false;
        return true;
@@ -2069,6 +2077,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
    options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });

    options.push_back({ "batched-bench" });
    options.push_back({ "batched-bench", " --output-format {md,jsonl}", "output format for batched-bench results (default: md)" });

    printf("usage: %s [options]\n", argv[0]);

    for (const auto & o : options) {

@@ -299,6 +299,9 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";

    // batched-bench params
    bool batched_bench_output_jsonl = false;
};

void gpt_params_parse_from_env(gpt_params & params);

@@ -308,6 +308,20 @@ class Model:
            ):
                data_qtype = gguf.GGMLQuantizationType.F32

            if data_qtype is False and any(
                self.match_model_tensor_name(new_name, key, bid)
                for key in (
                    gguf.MODEL_TENSOR.TOKEN_EMBD,
                    gguf.MODEL_TENSOR.OUTPUT,
                )
            ):
                if self.ftype in (
                    gguf.LlamaFileType.MOSTLY_TQ1_0,
                    gguf.LlamaFileType.MOSTLY_TQ2_0,
                ):
                    # TODO: use Q4_K and Q6_K
                    data_qtype = gguf.GGMLQuantizationType.F16

            # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
            if isinstance(data_qtype, bool):
                if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -318,6 +332,10 @@ class Model:
                    data_qtype = gguf.GGMLQuantizationType.BF16
                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                    data_qtype = gguf.GGMLQuantizationType.Q8_0
                elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
                    data_qtype = gguf.GGMLQuantizationType.TQ1_0
                elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
                    data_qtype = gguf.GGMLQuantizationType.TQ2_0
                else:
                    raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -1623,15 +1641,16 @@ class BitnetModel(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

    def weight_quant(self, weight):
    def weight_quant(self, weight: Tensor) -> Tensor:
        dtype = weight.dtype
        weight = weight.float()
        s = 1 / weight.abs().mean().clamp(min=1e-5)
        weight = (weight * s).round().clamp(-1, 1) / s
        scale = weight.abs().max().unsqueeze(0)
        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
        weight = torch.sign(weight).type(dtype)
        return weight.type(dtype), scale.type(torch.float32)
        scale = weight.abs().mean().clamp(min=1e-5)
        iscale = 1 / scale
        # TODO: multiply by the scale directly instead of inverting it twice
        # (this is also unnecessarily doubly inverted upstream)
        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
        result = (weight * iscale).round().clamp(-1, 1) / iscale
        return result.type(dtype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)
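For reference, the new `weight_quant` keeps the per-tensor mean-absolute scale and rounds every element to one of {-scale, 0, +scale}, instead of returning a separately packed sign tensor plus scale. A minimal sketch of the same math in NumPy (the helper name is illustrative, and NumPy stands in for torch):

```python
import numpy as np

def ternary_round_trip(weight: np.ndarray, eps: float = 1e-5) -> np.ndarray:
    # mean absolute value acts as the per-tensor scale (clamped to avoid division by zero)
    scale = max(np.abs(weight).mean(), eps)
    iscale = 1.0 / scale
    # round to the nearest of {-1, 0, +1} in scaled space, then map back
    return np.clip(np.round(weight * iscale), -1, 1) / iscale

w = np.array([0.4, -0.05, 1.3, -0.7])
print(ternary_round_trip(w))  # every value becomes -scale, 0, or +scale
```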
@@ -1646,10 +1665,8 @@ class BitnetModel(Model):
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
            weight_torch, scale_torch = self.weight_quant(data_torch)
            yield (new_name, weight_torch)
            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
        else:
            data_torch = self.weight_quant(data_torch)

        yield (new_name, data_torch)

@@ -4011,8 +4028,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -4099,6 +4116,8 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

@@ -49,3 +49,12 @@ There are 2 modes of operation:
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |

### JSONL output

Pass `--output-format jsonl` to output JSONL instead of Markdown, à la

```json lines
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
```

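The JSONL form is convenient for scripting. A minimal sketch of post-processing it in Python (the file name and the choice of fields are illustrative, not part of the tool):

```python
import json

# one JSON object per line, as produced by --output-format jsonl
with open("batched-bench.jsonl") as f:
    for line in f:
        r = json.loads(line)
        # e.g. print parallel level, KV footprint and overall throughput
        print(r["pl"], r["n_kv"], r["speed"])
```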
@@ -122,12 +122,13 @@ int main(int argc, char ** argv) {
        }
    }

    if (!params.batched_bench_output_jsonl) {
        LOG_TEE("\n");
        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
        LOG_TEE("\n");

        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
    }

    for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -195,10 +196,19 @@ int main(int argc, char ** argv) {
                const float speed_tg = pl*tg / t_tg;
                const float speed = n_kv / t;

                if(params.batched_bench_output_jsonl) {
                    LOG_TEE(
                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
                    );
                } else {
                    LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
                }
            }
        }
    }

    llama_print_timings(ctx);

@@ -27,6 +27,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },

@@ -51,15 +51,12 @@ enum stop_type {
    STOP_TYPE_PARTIAL,
};

// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_PROCESSING,
};

enum slot_command {
    SLOT_COMMAND_NONE,
    SLOT_COMMAND_LOAD_PROMPT,
    SLOT_COMMAND_RELEASE,
    SLOT_STATE_PROCESSING_PROMPT,
    SLOT_STATE_DONE_PROMPT,
    SLOT_STATE_GENERATING,
};

enum server_state {
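The refactor collapses the old state + command pair into a single slot_state enum. A rough Python sketch of the lifecycle implied by the slot-handling changes in this commit (the transition labels are paraphrased comments, not actual identifiers from the code):

```python
from enum import Enum, auto

class SlotState(Enum):
    IDLE = auto()
    PROCESSING_PROMPT = auto()
    DONE_PROMPT = auto()
    GENERATING = auto()

# approximate lifecycle of one completion request
TRANSITIONS = {
    SlotState.IDLE:              SlotState.PROCESSING_PROMPT,  # task assigned, prompt starts decoding
    SlotState.PROCESSING_PROMPT: SlotState.DONE_PROMPT,        # n_past == n_prompt_tokens
    SlotState.DONE_PROMPT:       SlotState.GENERATING,         # next-token prediction (embedding slots release instead)
    SlotState.GENERATING:        SlotState.IDLE,               # release() on a stop condition
}

state = SlotState.IDLE
for _ in range(4):
    state = TRANSITIONS[state]
print(state)  # back to IDLE after one full pass
```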
@ -136,7 +133,6 @@ struct server_slot {
|
|||
struct slot_params params;
|
||||
|
||||
slot_state state = SLOT_STATE_IDLE;
|
||||
slot_command command = SLOT_COMMAND_NONE;
|
||||
|
||||
// used to determine the slot that has been used the longest
|
||||
int64_t t_last_used = -1;
|
||||
|
@ -195,6 +191,8 @@ struct server_slot {
|
|||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
|
||||
std::function<void(int)> callback_on_release;
|
||||
|
||||
void reset() {
|
||||
n_prompt_tokens = 0;
|
||||
generated_text = "";
|
||||
|
@ -229,25 +227,28 @@ struct server_slot {
|
|||
return n_remaining > 0; // no budget
|
||||
}
|
||||
|
||||
bool available() const {
|
||||
return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
|
||||
}
|
||||
|
||||
bool is_processing() const {
|
||||
return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
|
||||
return state != SLOT_STATE_IDLE;
|
||||
}
|
||||
|
||||
void add_token_string(const completion_token_output & token) {
|
||||
if (command == SLOT_COMMAND_RELEASE) {
|
||||
if (!is_processing()) {
|
||||
return;
|
||||
}
|
||||
generated_token_probs.push_back(token);
|
||||
}
|
||||
|
||||
void release() {
|
||||
if (state == SLOT_STATE_PROCESSING) {
|
||||
if (is_processing()) {
|
||||
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
||||
command = SLOT_COMMAND_RELEASE;
|
||||
state = SLOT_STATE_IDLE;
|
||||
LOG_INFO("slot released", {
|
||||
{"id_slot", id},
|
||||
{"id_task", id_task},
|
||||
{"n_past", n_past},
|
||||
{"truncated", truncated},
|
||||
});
|
||||
callback_on_release(id);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -354,6 +355,9 @@ struct server_metrics {
|
|||
uint64_t n_tokens_predicted = 0;
|
||||
uint64_t t_tokens_generation = 0;
|
||||
|
||||
uint64_t n_decode_total = 0;
|
||||
uint64_t n_busy_slots_total = 0;
|
||||
|
||||
void init() {
|
||||
t_start = ggml_time_us();
|
||||
}
|
||||
|
@ -372,6 +376,15 @@ struct server_metrics {
|
|||
t_tokens_generation_total += slot.t_token_generation;
|
||||
}
|
||||
|
||||
void on_decoded(const std::vector<server_slot> & slots) {
|
||||
n_decode_total++;
|
||||
for (const auto & slot : slots) {
|
||||
if (slot.is_processing()) {
|
||||
n_busy_slots_total++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reset_bucket() {
|
||||
n_prompt_tokens_processed = 0;
|
||||
t_prompt_processing = 0;
|
||||
|
@ -413,6 +426,7 @@ struct server_queue {
|
|||
|
||||
// multi-task version of post()
|
||||
int post(std::vector<server_task> & tasks, bool front = false) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
for (auto & task : tasks) {
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
|
@ -432,6 +446,7 @@ struct server_queue {
|
|||
void defer(server_task task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
// Get the next id for creating a new task
|
||||
|
@@ -452,14 +467,14 @@ struct server_queue {
        callback_update_slots = std::move(callback);
    }

    // Call when the state of one slot is changed
    void notify_slot_changed() {
        // move deferred tasks back to main loop
    // Call when the state of one slot is changed, it will move one task from deferred to main queue
    void pop_deferred_task() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : queue_tasks_deferred) {
            queue_tasks.push_back(std::move(task));
        if (!queue_tasks_deferred.empty()) {
            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
            queue_tasks_deferred.pop_front();
        }
        queue_tasks_deferred.clear();
        condition_tasks.notify_one();
    }

    // end the start_loop routine
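The behavioural change here: instead of flushing every deferred task back onto the main queue whenever any slot changes state, a slot release now pulls exactly one deferred task forward. A small Python sketch of the new scheme (names are illustrative):

```python
from collections import deque

queue_tasks = deque()
queue_tasks_deferred = deque(["task-a", "task-b", "task-c"])

def pop_deferred_task():
    # promote at most one deferred task per slot release
    if queue_tasks_deferred:
        queue_tasks.append(queue_tasks_deferred.popleft())

pop_deferred_task()  # one slot was released
print(list(queue_tasks))           # ['task-a']
print(list(queue_tasks_deferred))  # ['task-b', 'task-c']
```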
@ -489,7 +504,7 @@ struct server_queue {
|
|||
break;
|
||||
}
|
||||
server_task task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
queue_tasks.pop_front();
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
|
||||
callback_new_task(task);
|
||||
|
@ -717,6 +732,10 @@ struct server_context {
|
|||
|
||||
slot.sparams = params.sparams;
|
||||
|
||||
slot.callback_on_release = [this](int) {
|
||||
queue_tasks.pop_deferred_task();
|
||||
};
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(slot);
|
||||
|
@ -798,7 +817,7 @@ struct server_context {
|
|||
|
||||
for (server_slot & slot : slots) {
|
||||
// skip the slot if it is not available
|
||||
if (!slot.available()) {
|
||||
if (slot.is_processing()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -840,7 +859,7 @@ struct server_context {
|
|||
int64_t t_last = ggml_time_us();
|
||||
for (server_slot & slot : slots) {
|
||||
// skip the slot if it is not available
|
||||
if (!slot.available()) {
|
||||
if (slot.is_processing()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -1078,7 +1097,7 @@ struct server_context {
|
|||
}
|
||||
}
|
||||
|
||||
slot.command = SLOT_COMMAND_LOAD_PROMPT;
|
||||
slot.state = SLOT_STATE_PROCESSING_PROMPT;
|
||||
slot.prompt_tokens.clear();
|
||||
|
||||
LOG_INFO("slot is processing task", {
|
||||
|
@ -1622,7 +1641,7 @@ struct server_context {
|
|||
queue_tasks.defer(task);
|
||||
break;
|
||||
}
|
||||
if (!slot->available()) {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
|
@ -1728,6 +1747,9 @@ struct server_context {
|
|||
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
||||
{ "t_tokens_generation", metrics.t_tokens_generation},
|
||||
|
||||
{ "n_decode_total", metrics.n_decode_total},
|
||||
{ "n_busy_slots_total", metrics.n_busy_slots_total},
|
||||
|
||||
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
||||
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
||||
|
||||
|
@ -1747,7 +1769,7 @@ struct server_context {
|
|||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||
break;
|
||||
}
|
||||
if (!slot->available()) {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
|
@ -1788,7 +1810,7 @@ struct server_context {
|
|||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||
break;
|
||||
}
|
||||
if (!slot->available()) {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
|
@ -1836,7 +1858,7 @@ struct server_context {
|
|||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||
break;
|
||||
}
|
||||
if (!slot->available()) {
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
|
||||
queue_tasks.defer(task);
|
||||
|
@ -1876,33 +1898,12 @@ struct server_context {
|
|||
system_prompt_update();
|
||||
}
|
||||
|
||||
// release slots
|
||||
for (auto & slot : slots) {
|
||||
if (slot.command == SLOT_COMMAND_RELEASE) {
|
||||
slot.state = SLOT_STATE_IDLE;
|
||||
slot.command = SLOT_COMMAND_NONE;
|
||||
slot.t_last_used = ggml_time_us();
|
||||
|
||||
LOG_INFO("slot released", {
|
||||
{"id_slot", slot.id},
|
||||
{"id_task", slot.id_task},
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_past", slot.n_past},
|
||||
{"n_system_tokens", system_tokens.size()},
|
||||
{"n_cache_tokens", slot.cache_tokens.size()},
|
||||
{"truncated", slot.truncated}
|
||||
});
|
||||
|
||||
queue_tasks.notify_slot_changed();
|
||||
}
|
||||
}
|
||||
|
||||
// check if all slots are idle
|
||||
{
|
||||
bool all_idle = true;
|
||||
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
|
||||
if (slot.is_processing()) {
|
||||
all_idle = false;
|
||||
break;
|
||||
}
|
||||
|
@ -1973,7 +1974,7 @@ struct server_context {
|
|||
|
||||
// first, add sampled tokens from any ongoing sequences
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state == SLOT_STATE_IDLE) {
|
||||
if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2015,7 +2016,7 @@ struct server_context {
|
|||
if (params.cont_batching || batch.n_tokens == 0) {
|
||||
for (auto & slot : slots) {
|
||||
// this slot still has a prompt to be processed
|
||||
if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
|
||||
auto & prompt_tokens = slot.prompt_tokens;
|
||||
|
||||
// we haven't tokenized the prompt yet - do it now:
|
||||
|
@ -2083,8 +2084,6 @@ struct server_context {
|
|||
{"id_task", slot.id_task}
|
||||
});
|
||||
|
||||
slot.state = SLOT_STATE_PROCESSING;
|
||||
slot.command = SLOT_COMMAND_NONE;
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
|
@ -2094,8 +2093,6 @@ struct server_context {
|
|||
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
|
||||
// this prompt is too large to process - discard it
|
||||
if (slot.n_prompt_tokens > n_ubatch) {
|
||||
slot.state = SLOT_STATE_PROCESSING;
|
||||
slot.command = SLOT_COMMAND_NONE;
|
||||
slot.release();
|
||||
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
|
||||
continue;
|
||||
|
@ -2253,10 +2250,9 @@ struct server_context {
|
|||
{"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
|
||||
});
|
||||
|
||||
// entire prompt has been processed - start decoding new tokens
|
||||
// entire prompt has been processed
|
||||
if (slot.n_past == slot.n_prompt_tokens) {
|
||||
slot.state = SLOT_STATE_PROCESSING;
|
||||
slot.command = SLOT_COMMAND_NONE;
|
||||
slot.state = SLOT_STATE_DONE_PROMPT;
|
||||
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
|
||||
|
@ -2338,18 +2334,17 @@ struct server_context {
|
|||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
metrics.on_decoded(slots);
|
||||
|
||||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
|
||||
{"i", i},
|
||||
{"n_batch", ret},
|
||||
{"n_batch", n_batch},
|
||||
{"ret", ret},
|
||||
});
|
||||
for (auto & slot : slots) {
|
||||
slot.state = SLOT_STATE_PROCESSING;
|
||||
slot.command = SLOT_COMMAND_NONE;
|
||||
slot.release();
|
||||
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
|
||||
}
|
||||
|
@ -2370,16 +2365,23 @@ struct server_context {
|
|||
}
|
||||
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
|
||||
if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
// prompt evaluated for embedding
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT) {
|
||||
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
|
||||
// prompt evaluated for embedding
|
||||
send_embedding(slot, batch_view);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue; // continue loop of slots
|
||||
} else {
|
||||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
}
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
|
@ -2425,6 +2427,7 @@ struct server_context {
|
|||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
|
@ -2705,7 +2708,7 @@ int main(int argc, char ** argv) {
|
|||
task.type = SERVER_TASK_TYPE_METRICS;
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_tasks.post(task, true); // high-priority task
|
||||
|
||||
// get the result
|
||||
server_task_result result = ctx_server.queue_results.recv(task.id);
|
||||
|
@ -2737,7 +2740,7 @@ int main(int argc, char ** argv) {
|
|||
task.data.push_back({{"reset_bucket", true}});
|
||||
|
||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
||||
ctx_server.queue_tasks.post(task);
|
||||
ctx_server.queue_tasks.post(task, true); // high-priority task
|
||||
|
||||
// get the result
|
||||
server_task_result result = ctx_server.queue_results.recv(task.id);
|
||||
|
@ -2751,6 +2754,9 @@ int main(int argc, char ** argv) {
|
|||
const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
|
||||
const uint64_t t_tokens_generation = data.at("t_tokens_generation");
|
||||
|
||||
const uint64_t n_decode_total = data.at("n_decode_total");
|
||||
const uint64_t n_busy_slots_total = data.at("n_busy_slots_total");
|
||||
|
||||
const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
|
||||
|
||||
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
|
||||
|
@@ -2771,6 +2777,14 @@ int main(int argc, char ** argv) {
                    {"name", "tokens_predicted_seconds_total"},
                    {"help", "Predict process time"},
                    {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
            }, {
                    {"name", "n_decode_total"},
                    {"help", "Total number of llama_decode() calls"},
                    {"value", n_decode_total}
            }, {
                    {"name", "n_busy_slots_per_decode"},
                    {"help", "Average number of busy slots per llama_decode() call"},
                    {"value", (float) n_busy_slots_total / (float) n_decode_total}
            }}},
            {"gauge", {{
                    {"name", "prompt_tokens_seconds"},
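The new n_busy_slots_per_decode gauge is just the ratio of the two counters introduced earlier: n_decode_total is bumped once per llama_decode() call and n_busy_slots_total once per slot still processing during that call. A quick sketch of the arithmetic (the numbers are invented for illustration):

```python
# counters as accumulated by server_metrics::on_decoded()
n_decode_total     = 1000   # llama_decode() calls
n_busy_slots_total = 1800   # sum over calls of slots that were still processing

n_busy_slots_per_decode = n_busy_slots_total / n_decode_total
print(n_busy_slots_per_decode)  # 1.8 slots busy per decode, on average
```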
@ -2837,7 +2851,7 @@ int main(int argc, char ** argv) {
|
|||
task.data = {
|
||||
{ "id_slot", id_slot },
|
||||
{ "filename", filename },
|
||||
{ "filepath", filepath }
|
||||
{ "filepath", filepath },
|
||||
};
|
||||
|
||||
const int id_task = ctx_server.queue_tasks.post(task);
|
||||
|
@ -2867,7 +2881,7 @@ int main(int argc, char ** argv) {
|
|||
task.data = {
|
||||
{ "id_slot", id_slot },
|
||||
{ "filename", filename },
|
||||
{ "filepath", filepath }
|
||||
{ "filepath", filepath },
|
||||
};
|
||||
|
||||
const int id_task = ctx_server.queue_tasks.post(task);
|
||||
|
@ -2945,7 +2959,7 @@ int main(int argc, char ** argv) {
|
|||
{ "system_prompt", ctx_server.system_prompt.c_str() },
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params.n_parallel },
|
||||
{ "chat_template", curr_tmpl.c_str() }
|
||||
{ "chat_template", curr_tmpl.c_str() },
|
||||
};
|
||||
|
||||
res_ok(res, data);
|
||||
|
|
|
@ -77,6 +77,35 @@ Feature: Parallel
|
|||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario Outline: Multi users with number of prompts exceeding number of slots
|
||||
Given a system prompt You are a writer.
|
||||
And a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long book.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another a poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is LLM?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
The sky is blue and I love it.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
And streaming is <streaming>
|
||||
Given concurrent OAI completions requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| streaming | n_predict |
|
||||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
|
||||
Given a prompt:
|
||||
|
|
|
@ -15,6 +15,7 @@ Feature: Passkey / Self-extend with context shift
|
|||
And <n_junk> as number of junk
|
||||
And <n_predicted> server max tokens to predict
|
||||
And 42 as seed
|
||||
And 0.0 temperature
|
||||
And <n_ctx> KV cache size
|
||||
And 1 slots
|
||||
And <n_ga> group attention factor to extend context size through self-extend
|
||||
|
@ -22,7 +23,8 @@ Feature: Passkey / Self-extend with context shift
|
|||
# Can be override with N_GPU_LAYERS
|
||||
And <ngl> GPU offloaded layers
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
# Higher timeout because the model may need to be downloaded from the internet
|
||||
Then the server is healthy with timeout 120 seconds
|
||||
Given available models
|
||||
Then model 0 is trained on <n_ctx_train> tokens context
|
||||
Given a prefix prompt:
|
||||
|
|
|
@ -202,17 +202,15 @@ def step_start_server(context):
|
|||
time.sleep(0.1)
|
||||
|
||||
|
||||
@step("the server is {expecting_status}")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
||||
async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
|
||||
match expecting_status:
|
||||
case 'healthy':
|
||||
await wait_for_slots_status(context, context.base_url, 200,
|
||||
timeout=30)
|
||||
timeout=timeout)
|
||||
|
||||
case 'ready' | 'idle':
|
||||
await wait_for_slots_status(context, context.base_url, 200,
|
||||
timeout=30,
|
||||
timeout=timeout,
|
||||
params={'fail_on_no_slot': 1},
|
||||
slots_idle=context.n_slots,
|
||||
slots_processing=0)
|
||||
|
@ -225,6 +223,18 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status: Lite
|
|||
assert False, "unknown status"
|
||||
|
||||
|
||||
@step("the server is {expecting_status} with timeout {timeout:d} seconds")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
|
||||
await wait_for_server_status_with_timeout(context, expecting_status, timeout)
|
||||
|
||||
|
||||
@step("the server is {expecting_status}")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
||||
await wait_for_server_status_with_timeout(context, expecting_status, 30)
|
||||
|
||||
|
||||
@step('all slots are {expected_slot_status_string}')
|
||||
@async_run_until_complete
|
||||
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
||||
|
|
|
@@ -401,6 +401,8 @@ extern "C" {
        GGML_TYPE_Q4_0_4_4 = 31,
        GGML_TYPE_Q4_0_4_8 = 32,
        GGML_TYPE_Q4_0_8_8 = 33,
        GGML_TYPE_TQ1_0 = 34,
        GGML_TYPE_TQ2_0 = 35,
        GGML_TYPE_COUNT,
    };

@ -1165,6 +1165,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||
}
|
||||
}
|
||||
|
||||
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
||||
// since the tensor is pre-allocated, it cannot be moved to another backend
|
||||
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
||||
}
|
||||
|
||||
// graph input
|
||||
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
||||
|
@@ -1644,7 +1649,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        sched->prev_leaf_backend_ids = tmp;
    }

    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
    if (sched->graph.size < graph_size) {
        sched->graph.size = graph_size;
        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
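The resized graph buffer now also accounts for leafs and for the number of pipeline copies. A toy comparison of the old and new formulas (all numbers are made up, and GGML_SCHED_MAX_SPLIT_INPUTS is treated as a plain constant here rather than its real value):

```python
GGML_SCHED_MAX_SPLIT_INPUTS = 30  # illustrative value, not taken from the source

def old_size(n_nodes, n_leafs, n_splits, n_copies):
    return n_nodes + n_splits * GGML_SCHED_MAX_SPLIT_INPUTS * 2

def new_size(n_nodes, n_leafs, n_splits, n_copies):
    return max(n_nodes, n_leafs) + n_splits * GGML_SCHED_MAX_SPLIT_INPUTS * 2 * n_copies

# a graph with more leafs than nodes and pipeline parallelism enabled
print(old_size(n_nodes=100, n_leafs=400, n_splits=4, n_copies=2))  # 340 -> too small to hold 400 leafs
print(new_size(n_nodes=100, n_leafs=400, n_splits=4, n_copies=2))  # 880
```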
@ -1696,6 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||
}
|
||||
}
|
||||
|
@ -1709,6 +1715,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||
}
|
||||
}
|
||||
|
@ -1719,6 +1726,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -227,6 +227,25 @@ typedef struct {
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");

//
// Ternary quantization
//

// 1.6875 bpw
typedef struct {
    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
    uint8_t qh[QK_K/64]; // 4 elements per byte
    ggml_half d;
} block_tq1_0;
static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");

// 2.0625 bpw
typedef struct {
    uint8_t qs[QK_K/4]; // 2 bits per element
    ggml_half d;
} block_tq2_0;
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");

//
// Super-block quantization structures
//

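The 1.6875 and 2.0625 bpw figures follow directly from the struct layouts with QK_K = 256 and a 2-byte ggml_half. A quick check in Python:

```python
QK_K = 256       # elements per super-block
sizeof_half = 2  # bytes in ggml_half (fp16)

# block_tq1_0: 48 bytes of base-3 packed data + 4 bytes of qh + fp16 scale
tq1_0_bytes = (QK_K - 4 * QK_K // 64) // 5 + QK_K // 64 + sizeof_half  # 48 + 4 + 2 = 54
print(tq1_0_bytes * 8 / QK_K)  # 1.6875 bits per weight

# block_tq2_0: 2 bits per element + fp16 scale
tq2_0_bytes = QK_K // 4 + sizeof_half                                  # 64 + 2 = 66
print(tq2_0_bytes * 8 / QK_K)  # 2.0625 bits per weight
```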
@ -361,6 +380,7 @@ typedef struct {
|
|||
} block_iq3_s;
|
||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
||||
|
||||
// 1.5625 bpw
|
||||
typedef struct {
|
||||
ggml_half d;
|
||||
uint8_t qs[QK_K/8];
|
||||
|
|
|
@ -2576,10 +2576,17 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||
// store a pointer to each copy op CUDA kernel to identify it later
|
||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
if (!ptr) {
|
||||
use_cuda_graph = false;
|
||||
#ifndef NDEBUG
|
||||
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
|
||||
#endif
|
||||
} else {
|
||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!use_cuda_graph) {
|
||||
break;
|
||||
|
@ -2846,6 +2853,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
|
|
|
@ -428,7 +428,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|||
char * src0_ddc = (char *) src0->data;
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
|
@ -449,9 +452,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -461,7 +463,9 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
}
|
||||
|
||||
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
return nullptr;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
|
@ -482,8 +486,7 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
|||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -175,7 +175,7 @@ typedef __fp16 ggml_fp16_internal_t;

// 32-bit ARM compatibility

// vaddvq_s16
// vaddlvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
@@ -185,12 +185,9 @@ typedef __fp16 ggml_fp16_internal_t;
// vzip1_u8
// vzip2_u8

inline static int32_t vaddvq_s16(int16x8_t v) {
    return
        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
inline static int32_t vaddlvq_s16(int16x8_t v) {
    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
}

inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {

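For 32-bit ARM builds that lack the vaddlvq_s16 intrinsic, the shim above widens pairwise and then sums two lanes; its net effect is simply the widened horizontal sum of the eight int16 lanes. A scalar Python sketch of the same reduction:

```python
def vaddlvq_s16(v):
    # widened horizontal sum of an 8-lane int16 vector, in the same shape as the NEON shim
    assert len(v) == 8
    pairs16 = [v[0] + v[1], v[2] + v[3], v[4] + v[5], v[6] + v[7]]  # vpaddlq_s16
    pairs32 = [pairs16[0] + pairs16[1], pairs16[2] + pairs16[3]]    # vpaddlq_s32
    return pairs32[0] + pairs32[1]

print(vaddlvq_s16([1, -2, 3, -4, 5, -6, 7, -8]))  # -4, same as sum(...)
```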
@@ -1631,7 +1631,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int6
// ===================== Helper functions
//
static inline int nearest_int(float fval) {
    assert(fval <= 4194303.f);
    assert(fabsf(fval) <= 4194303.f);
    float val = fval + 12582912.f;
    int i; memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000;
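The corrected assert matches what the bit trick can actually handle: adding 12582912.0 (1.5 * 2^23) pushes any value within roughly ±2^22 into a float whose low mantissa bits hold the rounded integer, biased by 0x00400000. A Python sketch of the same trick, with struct standing in for the C memcpy:

```python
import struct

def nearest_int(fval: float) -> int:
    assert abs(fval) <= 4194303.0                 # ~2**22, the range the trick supports
    val = struct.pack("<f", fval + 12582912.0)    # 12582912.0 == 1.5 * 2**23
    (i,) = struct.unpack("<i", val)
    return (i & 0x007FFFFF) - 0x00400000

for x in (-2.5, -0.4, 0.5, 3.7):
    print(x, nearest_int(x))  # rounds to nearest, ties resolved by float rounding
```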
@ -3307,6 +3307,191 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
|
|||
return nrow * row_size;
|
||||
}
|
||||
|
||||
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
|
||||
|
||||
void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
for (int64_t i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK_K; j++) {
|
||||
const float v = x[j];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
|
||||
// 5 elements per byte, along 32 bytes
|
||||
for (size_t j = 0; j < sizeof(y->qs) - sizeof(y->qs) % 32; j += 32) {
|
||||
for (size_t m = 0; m < 32; ++m) {
|
||||
uint8_t q = 0;
|
||||
for (size_t n = 0; n < 5; ++n) {
|
||||
int xi = lroundf(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2
|
||||
q *= 3;
|
||||
q += xi;
|
||||
}
|
||||
// ceiling division (243 == pow(3, 5))
|
||||
q = ((uint16_t)q * 256 + (243 - 1)) / 243;
|
||||
y[i].qs[j + m] = q;
|
||||
}
|
||||
x += 5*32;
|
||||
}
|
||||
// along 16 bytes
|
||||
for (size_t j = sizeof(y->qs) - sizeof(y->qs) % 32; j < sizeof(y->qs); j += 16) {
|
||||
for (size_t m = 0; m < 16; ++m) {
|
||||
uint8_t q = 0;
|
||||
for (size_t n = 0; n < 5; ++n) {
|
||||
int xi = lroundf(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2
|
||||
q *= 3;
|
||||
q += xi;
|
||||
}
|
||||
// ceiling division (243 == pow(3, 5))
|
||||
q = ((uint16_t)q * 256 + (243 - 1)) / 243;
|
||||
y[i].qs[j + m] = q;
|
||||
}
|
||||
x += 5*16;
|
||||
}
|
||||
// 4 elements per byte
|
||||
for (size_t j = 0; j < sizeof(y->qh); ++j) {
|
||||
uint8_t q = 0;
|
||||
for (size_t m = 0; m < 4; ++m) {
|
||||
// -1, 0, 1 -> 0, 1, 2
|
||||
int xi = lroundf(x[j + m*sizeof(y->qh)] * id) + 1;
|
||||
q *= 3;
|
||||
q += xi;
|
||||
}
|
||||
// shift the first value to the most significant trit
|
||||
q *= 3;
|
||||
// ceiling division (243 == pow(3, 5))
|
||||
q = ((uint16_t)q * 256 + (243 - 1)) / 243;
|
||||
y[i].qh[j] = q;
|
||||
}
|
||||
x += 4*sizeof(y->qh);
|
||||
}
|
||||
}
|
||||
|
||||
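The packing loop above folds five ternary digits into one base-3 value (0..242) and then applies the ceiling division by 243 noted in the comments, stretching it over the full byte so decoding can later use multiplies and shifts instead of divisions. A small Python sketch showing the encoding stays collision-free, which is why five trits fit in one byte (3^5 = 243 < 256); the helper name is illustrative:

```python
from itertools import product

def pack5(trits):
    # accumulate base-3, most significant trit first, as in quantize_row_tq1_0_ref
    q = 0
    for t in trits:
        assert t in (0, 1, 2)
        q = q * 3 + t
    # ceiling division (243 == 3**5): map 0..242 onto 0..255
    return (q * 256 + 242) // 243

codes = {pack5(t) for t in product((0, 1, 2), repeat=5)}
print(len(codes))              # 243 distinct bytes -> no two trit groups collide
print(min(codes), max(codes))  # 0 .. 255
```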
void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
for (int64_t i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK_K; j++) {
|
||||
const float v = x[j];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
|
||||
for (size_t j = 0; j < sizeof(y->qs); j += 32) {
|
||||
for (size_t m = 0; m < 32; ++m) {
|
||||
uint8_t q = 0;
|
||||
for (size_t n = 0; n < 4; ++n) {
|
||||
// -1, 0, 1 -> 0, 1, 2
|
||||
int xi = lroundf(x[m + n*32] * id) + 1;
|
||||
q += (xi & 3) << (2*n);
|
||||
}
|
||||
y[i].qs[j + m] = q;
|
||||
}
|
||||
x += 4*32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_tq1_0 * restrict y = vy;
|
||||
quantize_row_tq1_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_tq2_0 * restrict y = vy;
|
||||
quantize_row_tq2_0_ref(x, y, k);
|
||||
}
|
||||
|
||||
size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
|
||||
quantize_row_tq1_0(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||
(void)quant_weights; // not used
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
|
||||
quantize_row_tq2_0(src, dst, (int64_t)nrow*n_per_row);
|
||||
return nrow * row_size;
|
||||
}
|
||||
|
||||
|
||||
void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
|
||||
|
||||
for (int64_t i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
||||
for (size_t n = 0; n < 5; ++n) {
|
||||
for (size_t m = 0; m < 32; ++m) {
|
||||
uint8_t q = x[i].qs[j + m] * pow3[n];
|
||||
int16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
*y++ = (float) (xi - 1) * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
||||
for (size_t n = 0; n < 5; ++n) {
|
||||
for (size_t m = 0; m < 16; ++m) {
|
||||
uint8_t q = x[i].qs[j + m] * pow3[n];
|
||||
int16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
*y++ = (float) (xi - 1) * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t n = 0; n < 4; ++n) {
|
||||
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
||||
uint8_t q = x[i].qh[j] * pow3[n];
|
||||
int16_t xi = ((uint16_t) q * 3) >> 8;
|
||||
*y++ = (float) (xi - 1) * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
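The digit extraction in dequantize_row_tq1_0 above relies on the packed byte having been pre-scaled by 256/243: multiplying by pow3[n] as a uint8 (i.e. modulo 256) rotates the n-th base-3 digit to the top of the byte, and `((uint16_t) q * 3) >> 8` then reads it out as 0, 1 or 2 before the -1 offset. A standalone Python check of that identity (helper names are illustrative):

```python
pow3 = [1, 3, 9, 27, 81, 243]

def extract_trit(packed, n):
    q = (packed * pow3[n]) & 0xFF  # uint8_t multiply, like x[i].qs[j + m] * pow3[n]
    return (q * 3) >> 8            # 0, 1 or 2; the caller subtracts 1 to get {-1, 0, +1}

# pack the base-3 digits 2,1,0,1,2 the same way quantize_row_tq1_0_ref does
value = (((2 * 3 + 1) * 3 + 0) * 3 + 1) * 3 + 2  # 194
packed = (value * 256 + 242) // 243              # 205 after the ceiling division
print([extract_trit(packed, n) for n in range(5)])  # [2, 1, 0, 1, 2]
```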
void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
for (int64_t i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
||||
for (size_t l = 0; l < 4; ++l) {
|
||||
for (size_t m = 0; m < 32; ++m) {
|
||||
int8_t q = (x[i].qs[j + m] >> (l*2)) & 3;
|
||||
*y++ = (float) (q - 1) * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== "True" 2-bit (de)-quantization
|
||||
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
|
||||
|
@ -5471,6 +5656,501 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_tq1_0 * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float sumf = 0.0f;
|
||||
|
||||
uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
|
||||
|
||||
const uint8x16_t shift = vld1q_u8(k_shift);
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
int32x4_t sumi0 = vdupq_n_s32(0);
|
||||
int32x4_t sumi1 = vdupq_n_s32(0);
|
||||
#else
|
||||
int16x8_t sumi0 = vdupq_n_s16(0);
|
||||
int16x8_t sumi1 = vdupq_n_s16(0);
|
||||
#endif
|
||||
|
||||
// first 32 bytes of 5 elements
|
||||
{
|
||||
uint8x16_t qx0 = vld1q_u8(x[i].qs + 0);
|
||||
uint8x16_t qx1 = vld1q_u8(x[i].qs + 16);
|
||||
uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3));
|
||||
uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3));
|
||||
uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9));
|
||||
uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9));
|
||||
uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27));
|
||||
uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27));
|
||||
uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81));
|
||||
uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81));
|
||||
|
||||
// multiply by 3 and keep the 2 bits above 8 bits
|
||||
int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
|
||||
int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
|
||||
int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
|
||||
int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
|
||||
int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
|
||||
int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
|
||||
int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6));
|
||||
int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6));
|
||||
int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6));
|
||||
int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6));
|
||||
|
||||
const int8x16_t qy0 = vld1q_s8(y[i].qs + 0);
|
||||
const int8x16_t qy1 = vld1q_s8(y[i].qs + 16);
|
||||
const int8x16_t qy2 = vld1q_s8(y[i].qs + 32);
|
||||
const int8x16_t qy3 = vld1q_s8(y[i].qs + 48);
|
||||
const int8x16_t qy4 = vld1q_s8(y[i].qs + 64);
|
||||
const int8x16_t qy5 = vld1q_s8(y[i].qs + 80);
|
||||
const int8x16_t qy6 = vld1q_s8(y[i].qs + 96);
|
||||
const int8x16_t qy7 = vld1q_s8(y[i].qs + 112);
|
||||
const int8x16_t qy8 = vld1q_s8(y[i].qs + 128);
|
||||
const int8x16_t qy9 = vld1q_s8(y[i].qs + 144);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumi0 = vdotq_s32(sumi0, sqx0, qy0);
|
||||
sumi1 = vdotq_s32(sumi1, sqx1, qy1);
|
||||
sumi0 = vdotq_s32(sumi0, sqx2, qy2);
|
||||
sumi1 = vdotq_s32(sumi1, sqx3, qy3);
|
||||
sumi0 = vdotq_s32(sumi0, sqx4, qy4);
|
||||
sumi1 = vdotq_s32(sumi1, sqx5, qy5);
|
||||
sumi0 = vdotq_s32(sumi0, sqx6, qy6);
|
||||
sumi1 = vdotq_s32(sumi1, sqx7, qy7);
|
||||
sumi0 = vdotq_s32(sumi0, sqx8, qy8);
|
||||
sumi1 = vdotq_s32(sumi1, sqx9, qy9);
|
||||
#else
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9));
|
||||
#endif
|
||||
}
|
||||
|
||||
// last 16 bytes of 5-element, along with the 4 bytes of 4 elements
|
||||
{
|
||||
uint8x16_t qx0 = vld1q_u8(x[i].qs + 32);
|
||||
uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3));
|
||||
uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9));
|
||||
uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27));
|
||||
uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81));
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
|
||||
uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh));
|
||||
qx5 = vmulq_u8(qx5, shift);
|
||||
|
||||
// multiply by 3 and keep the 2 bits above 8 bits
|
||||
int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6));
|
||||
int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6));
|
||||
int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6));
|
||||
int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6));
|
||||
int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6));
|
||||
int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6));
|
||||
|
||||
const int8x16_t qy0 = vld1q_s8(y[i].qs + 160);
|
||||
const int8x16_t qy1 = vld1q_s8(y[i].qs + 176);
|
||||
const int8x16_t qy2 = vld1q_s8(y[i].qs + 192);
|
||||
const int8x16_t qy3 = vld1q_s8(y[i].qs + 208);
|
||||
const int8x16_t qy4 = vld1q_s8(y[i].qs + 224);
|
||||
const int8x16_t qy5 = vld1q_s8(y[i].qs + 240);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumi0 = vdotq_s32(sumi0, sqx0, qy0);
|
||||
sumi1 = vdotq_s32(sumi1, sqx1, qy1);
|
||||
sumi0 = vdotq_s32(sumi0, sqx2, qy2);
|
||||
sumi1 = vdotq_s32(sumi1, sqx3, qy3);
|
||||
sumi0 = vdotq_s32(sumi0, sqx4, qy4);
|
||||
sumi1 = vdotq_s32(sumi1, sqx5, qy5);
|
||||
#else
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
|
||||
sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
|
||||
sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
|
||||
#endif
|
||||
}
|
||||
|
||||
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
||||
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
        sumi0 = vaddq_s32(sumi0, sumi1);
        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));

        sumf += d * (float) vaddvq_s32(sumi0);
#else
        sumi0 = vaddq_s16(sumi0, sumi1);
        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));

        sumf += d * (float) vaddlvq_s16(sumi0);
#endif
    }

    *s = sumf;

#elif defined(__AVX2__)
    __m256 sumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
        // 16-bit sums
        __m256i sumi0 = _mm256_setzero_si256();
        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();

        // first 32 bytes of 5 elements
        {
            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
            // 8-bit multiplies with shifts, masks and adds
            __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
            __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
            __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
            __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9

            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?

            // Cancel the +1 from avg so that it behaves like a halving add
            qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
            qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
            qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
            qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
            qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
            // Multiply by 3 and get the top 2 bits
            qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
            qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
            qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
            qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
            qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
            qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
            qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
            qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
            qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
            qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));

            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0));
            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32));
            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64));
            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96));
            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));

            qx0 = _mm256_maddubs_epi16(qx0, qy0);
            qx1 = _mm256_maddubs_epi16(qx1, qy1);
            qx2 = _mm256_maddubs_epi16(qx2, qy2);
            qx3 = _mm256_maddubs_epi16(qx3, qy3);
            qx4 = _mm256_maddubs_epi16(qx4, qy4);

            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
            sumi2 = _mm256_add_epi16(sumi2, qx4);
        }

        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
        {
            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
            uint32_t qh;
            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
            __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
            __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
            __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
            __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
            __m256i qx01 = MM256_SET_M128I(qx1, qx0);
            __m256i qx23 = MM256_SET_M128I(qx3, qx2);

            // avx2 does not have 8-bit multiplies, so 16-bit it is.
            qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
            qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
            __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));

            __m256i qx45 = MM256_SET_M128I(qx5, qx4);

            // Cancel the +1 from avg so that it behaves like a halving add
            qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
            qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
            qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
            // Multiply by 3 and get the top 2 bits
            qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
            qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
            qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
            qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
            qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
            qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));

            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));

            qx01 = _mm256_maddubs_epi16(qx01, qy01);
            qx23 = _mm256_maddubs_epi16(qx23, qy23);
            qx45 = _mm256_maddubs_epi16(qx45, qy45);

            sumi0 = _mm256_add_epi16(sumi0, qx01);
            sumi1 = _mm256_add_epi16(sumi1, qx23);
            sumi2 = _mm256_add_epi16(sumi2, qx45);
        }

        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));

        sumi0 = _mm256_sub_epi16(sumi0, ysum);
        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));

        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
    }

    *s = hsum_float_8(sumf);

#else
    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int sum = 0;

        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 32; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
                }
            }
        }
        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 16; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
                }
            }
        }

        for (size_t l = 0; l < 4; ++l) {
            for (size_t j = 0; j < sizeof(x->qh); ++j) {
                uint8_t q = x[i].qh[j] * pow3[l];
                uint16_t xi = ((uint16_t) q * 3) >> 8;
                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
            }
        }

        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
    }

    *s = sumf;
#endif
}
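
Note: the scalar fallback above relies on a fixed-point base-3 decode. Five ternary digits t0..t4 in {0, 1, 2} are packed into one byte as q = ceil((t0*81 + t1*27 + t2*9 + t3*3 + t4) * 256 / 243), which works because 3^5 = 243 <= 256; multiplying q by 3^l (mod 256) brings the l-th most-significant digit into the top bits, and ((q * 3^l) * 3) >> 8 recovers it. Below is a minimal standalone sketch of that extraction, assuming the same packing used by the gguf-py TQ1_0 quantizer later in this diff (tq1_0_digit is an illustrative helper, not part of the patch):

#include <stdint.h>

// Illustrative only: recover ternary digit l (0 = most significant) from a
// TQ1_0-packed byte and map it to {-1, 0, +1}, mirroring the pow3 loop above.
static inline int tq1_0_digit(uint8_t q, int l) {
    static const uint8_t pow3[5] = {1, 3, 9, 27, 81};
    const uint8_t shifted = (uint8_t) (q * pow3[l]);   // wraps mod 256 on purpose
    return (int) (((uint16_t) shifted * 3) >> 8) - 1;  // xi - 1, as in the kernel
}

For example, packing the digits (2, 1, 0, 2, 1) gives q = ceil(196 * 256 / 243) = 207, and tq1_0_digit(207, l) for l = 0..4 returns 1, 0, -1, 1, 0.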

void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq2_0 * restrict x = vx;
    const block_q8_K  * restrict y = vy;

    const int nb = n / QK_K;

#if defined(__ARM_NEON)
    float sumf = 0.0f;

    const uint8x16_t m3 = vdupq_n_u8(3);

    for (int i = 0; i < nb; ++i) {
#if defined(__ARM_FEATURE_DOTPROD)
        int32x4_t sumi0 = vdupq_n_s32(0);
        int32x4_t sumi1 = vdupq_n_s32(0);
#else
        int16x8_t sumi0 = vdupq_n_s16(0);
        int16x8_t sumi1 = vdupq_n_s16(0);
#endif

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            uint8x16_t qx0 = vld1q_u8(x[i].qs + j);
            uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16);
            uint8x16_t qx2 = vshrq_n_u8(qx0, 2);
            uint8x16_t qx3 = vshrq_n_u8(qx1, 2);
            uint8x16_t qx4 = vshrq_n_u8(qx0, 4);
            uint8x16_t qx5 = vshrq_n_u8(qx1, 4);
            uint8x16_t qx6 = vshrq_n_u8(qx0, 6);
            uint8x16_t qx7 = vshrq_n_u8(qx1, 6);

            int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3));
            int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3));
            int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3));
            int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3));
            int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3));
            int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3));
            int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3));
            int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3));

            const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0);
            const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16);
            const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32);
            const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48);
            const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64);
            const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80);
            const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96);
            const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112);

#if defined(__ARM_FEATURE_DOTPROD)
            sumi0 = vdotq_s32(sumi0, sqx0, qy0);
            sumi1 = vdotq_s32(sumi1, sqx1, qy1);
            sumi0 = vdotq_s32(sumi0, sqx2, qy2);
            sumi1 = vdotq_s32(sumi1, sqx3, qy3);
            sumi0 = vdotq_s32(sumi0, sqx4, qy4);
            sumi1 = vdotq_s32(sumi1, sqx5, qy5);
            sumi0 = vdotq_s32(sumi0, sqx6, qy6);
            sumi1 = vdotq_s32(sumi1, sqx7, qy7);
#else
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6));
            sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7));
            sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7));
#endif
        }

        const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
        const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);

        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;

#if defined(__ARM_FEATURE_DOTPROD)
        sumi0 = vaddq_s32(sumi0, sumi1);
        sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1)));

        sumf += d * (float) vaddvq_s32(sumi0);
#else
        sumi0 = vaddq_s16(sumi0, sumi1);
        sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1));

        sumf += d * (float) vaddlvq_s16(sumi0);
#endif
    }

    *s = sumf;

#elif defined(__AVX2__)
    __m256 sumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
        // 16-bit sums, because 256*127 still fits
        __m256i sumi0 = _mm256_setzero_si256();
        __m256i sumi1 = _mm256_setzero_si256();

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
            __m256i qx1 = _mm256_srli_epi16(qx0, 2);
            __m256i qx2 = _mm256_srli_epi16(qx0, 4);
            __m256i qx3 = _mm256_srli_epi16(qx0, 6);

            // 0, 1, 2 (should not be 3)
            qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
            qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
            qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
            qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));

            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0));
            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));

            qx0 = _mm256_maddubs_epi16(qx0, qy0);
            qx1 = _mm256_maddubs_epi16(qx1, qy1);
            qx2 = _mm256_maddubs_epi16(qx2, qy2);
            qx3 = _mm256_maddubs_epi16(qx3, qy3);

            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
        }

        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));

        sumi0 = _mm256_add_epi16(sumi0, sumi1);
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));

        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
    }

    *s = hsum_float_8(sumf);

#else
    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            for (size_t l = 0; l < 4; ++l) {
                for (size_t k = 0; k < 32; ++k) {
                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
                }
            }
        }

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

        sumf += (float) sumi * d;
    }

    *s = sumf;
#endif
}
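
Note: both SIMD paths above take the dot product against the biased values t in {0, 1, 2} (via maddubs/vdot on the unsigned weights) and remove the -1 offset in a single step using the per-group activation sums already stored in the Q8_K block (y[i].bsums), since sum((t - 1) * y) = sum(t * y) - sum(y). A scalar sketch of that identity, assuming 16-element groups as in block_q8_K (the helper name is illustrative, not part of the patch):

#include <stdint.h>

// Illustrative only: unbias a ternary dot product using precomputed group sums.
static int32_t tq_dot_unbias(const uint8_t * t, const int8_t * y, const int16_t * bsums, int n) {
    int32_t biased = 0;
    for (int k = 0; k < n; ++k) {
        biased += (int32_t) t[k] * y[k];   // t[k] in {0, 1, 2}
    }
    int32_t ysum = 0;
    for (int g = 0; g < n / 16; ++g) {
        ysum += bsums[g];                  // each entry sums 16 consecutive y values
    }
    return biased - ysum;                  // == sum((t[k] - 1) * y[k])
}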

void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
@@ -14801,6 +15481,14 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
                }
            }
        } break;
        case GGML_TYPE_TQ1_0:
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_tq1_0, data, nb);
            } break;
        case GGML_TYPE_TQ2_0:
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
            } break;
        case GGML_TYPE_IQ1_S:
            {
                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);

@@ -26,6 +26,9 @@ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);

void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
@@ -46,6 +49,9 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -67,6 +73,9 @@ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRI
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s  (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -90,6 +99,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -111,6 +123,9 @@ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -2480,7 +2480,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
    for (auto& buffer : descriptor_buffer_infos) {
        std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
        std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
    }
    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
    GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());

@@ -1062,7 +1062,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .ncols = 8,
        .gemv = ggml_gemv_q4_0_8x8_q8_0,
        .gemm = ggml_gemm_q4_0_8x8_q8_0,
    }
    },
    [GGML_TYPE_TQ1_0] = {
        .type_name = "tq1_0",
        .blck_size = QK_K,
        .type_size = sizeof(block_tq1_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_tq1_0,
        .from_float = quantize_row_tq1_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref,
        .vec_dot = ggml_vec_dot_tq1_0_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows = 1,
    },
    [GGML_TYPE_TQ2_0] = {
        .type_name = "tq2_0",
        .blck_size = QK_K,
        .type_size = sizeof(block_tq2_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
        .from_float = quantize_row_tq2_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
        .vec_dot = ggml_vec_dot_tq2_0_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows = 1,
    },
};

// For internal test use

@@ -9932,6 +9956,8 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -10310,6 +10336,8 @@ static void ggml_compute_forward_add1(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -10438,6 +10466,8 @@ static void ggml_compute_forward_acc(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -13440,6 +13470,8 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -13628,6 +13660,8 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -13890,6 +13924,8 @@ static void ggml_compute_forward_get_rows(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -14479,6 +14515,8 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_TQ1_0:
        case GGML_TYPE_TQ2_0:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
@@ -19572,7 +19610,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    return true;
}

#else // posix?
#elif defined(__gnu_linux__)
// TODO: this may not work on BSD, to be verified

static bool ggml_thread_apply_affinity(const bool * mask) {
    cpu_set_t cpuset;
@@ -19627,6 +19666,18 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    return true;
}

#else // unsupported platforms

static bool ggml_thread_apply_affinity(const bool * mask) {
    UNUSED(mask);
    return true;
}

static bool ggml_thread_apply_priority(int32_t prio) {
    UNUSED(prio);
    return true;
}

#endif

static bool ggml_thread_cpumask_is_valid(const bool * mask) {
@@ -21927,6 +21978,8 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

@@ -201,6 +201,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
#else
    std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
#endif

#ifdef GGML_VULKAN_SHADER_DEBUG_INFO
    cmd.push_back("-g");
#endif

    for (const auto& define : defines) {
        cmd.push_back("-D" + define.first + "=" + define.second);
    }

@@ -1291,6 +1291,8 @@ class GGMLQuantizationType(IntEnum):
    Q4_0_4_4 = 31
    Q4_0_4_8 = 32
    Q4_0_8_8 = 33
    TQ1_0 = 34
    TQ2_0 = 35


# TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -1335,6 +1337,8 @@ class LlamaFileType(IntEnum):
    MOSTLY_Q4_0_4_4 = 33  # except 1d tensors
    MOSTLY_Q4_0_4_8 = 34  # except 1d tensors
    MOSTLY_Q4_0_8_8 = 35  # except 1d tensors
    MOSTLY_TQ1_0 = 36  # except 1d tensors
    MOSTLY_TQ2_0 = 37  # except 1d tensors

    GUESSED = 1024  # not specified in the model file

@@ -1411,6 +1415,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
    GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
    GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
    GGMLQuantizationType.TQ1_0:   (256, 2 + 4 * 13),
    GGMLQuantizationType.TQ2_0:   (256, 2 + 64),
}
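
(For reference, these block sizes are where the bits-per-weight figures quoted later in llama_model_ftype_name come from: each block covers 256 weights, so TQ1_0 takes (2 + 4 * 13) * 8 / 256 = 54 * 8 / 256 = 1.6875 bpw, shown as "1.69 bpw ternary", and TQ2_0 takes (2 + 64) * 8 / 256 = 66 * 8 / 256 = 2.0625 bpw, shown as "2.06 bpw ternary"; the 2 bytes in each tuple are the fp16 block scale.)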

@@ -574,6 +574,87 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
        return (d * q).reshape((n_blocks, QK_K))


class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        n_blocks = blocks.shape[0]

        d = abs(blocks).max(axis=-1, keepdims=True)
        with np.errstate(divide="ignore"):
            id = np.where(d == 0, 0, 1 / d)
        qs = np_roundf(blocks * id)
        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)

        qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
        qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
        qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
        qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
        qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
        qs = np.concatenate([qs0, qs1, qh], axis=-1)
        qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243

        qs = qs.astype(np.uint8)
        d = d.astype(np.float16).view(np.uint8)

        return np.concatenate([qs, d], axis=-1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        n_blocks = blocks.shape[0]

        qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
        qh, d = np.hsplit(rest, [QK_K // 64])

        d = d.view(np.float16).astype(np.float32)

        qs0, qs1 = qs[..., :32], qs[..., 32:]
        qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs0 = qs0.reshape((n_blocks, -1))
        qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs1 = qs1.reshape((n_blocks, -1))
        qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
        qh = qh.reshape((n_blocks, -1))
        qs = np.concatenate([qs0, qs1, qh], axis=-1)
        qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)

        return (d * qs.astype(np.float32))


class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        n_blocks = blocks.shape[0]

        d = abs(blocks).max(axis=-1, keepdims=True)
        with np.errstate(divide="ignore"):
            id = np.where(d == 0, 0, 1 / d)
        qs = np_roundf(blocks * id)
        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)

        qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
        qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
        qs = qs.reshape((n_blocks, -1))

        d = d.astype(np.float16).view(np.uint8)

        return np.concatenate([qs, d], axis=-1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        n_blocks = blocks.shape[0]

        qs, d = np.hsplit(blocks, [QK_K // 4])

        d = d.view(np.float16).astype(np.float32)

        qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
        qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)

        return (d * qs.astype(np.float32))


class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
    ksigns: bytes = (
        b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"

@@ -66,6 +66,7 @@ class GGMLQuants:
        for t in (
            "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
            "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
            "tq1_0", "tq2_0",
            "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
            "iq4_nl", "iq4_xs",
        ):
@@ -167,6 +167,8 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -41,7 +41,7 @@ maxhordelen = 400
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
KcppVersion = "1.74"
KcppVersion = "1.75"
showdebug = True
guimode = False
showsamplerwarning = True

@@ -4462,6 +4462,8 @@ struct llama_model_loader {
            case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
            case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
            case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
            case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
            case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
            case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
            case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
@@ -5168,6 +5170,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -8176,23 +8180,23 @@ static bool llm_load_tensors(
                        layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
                        layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
                        layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
                        layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
                        layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
                        layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
                        layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
                        layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
                    }
                } break;
            case LLM_ARCH_T5:

@@ -14253,7 +14257,9 @@ struct llm_build_context {
            {
                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                if (model.layers[il].wq_scale) {
                    Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
                }
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -14262,7 +14268,9 @@ struct llm_build_context {

                // B1.K
                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                if (model.layers[il].wk_scale) {
                    Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
                }
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -14271,7 +14279,9 @@ struct llm_build_context {

                // B1.V
                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                if (model.layers[il].wv_scale) {
                    Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
                }
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -14302,7 +14312,9 @@ struct llm_build_context {
            cb(cur, "attn_sub_norm", il);

            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
            if (model.layers[il].wo_scale) {
                cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
            }
            if (model.layers[il].bo) {
                cur = ggml_add(ctx0, cur, model.layers[il].bo);
            }
@@ -14339,7 +14351,9 @@ struct llm_build_context {
            cb(cur, "ffn_sub_norm", il);

            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
            if (model.layers[il].ffn_down_scale) {
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
            }
            cb(cur, "ffn_down", il);

            cur = ggml_add(ctx0, cur, ffn_inp);

@@ -17009,6 +17023,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type == GGML_TYPE_Q4_0_8_8) {
                new_type = GGML_TYPE_Q4_0;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                new_type = GGML_TYPE_Q4_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -17208,6 +17225,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    }
    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
            case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_S:
@@ -17313,6 +17332,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
        case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;