Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)

Commit e44ddf26ef: Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/server.yml
#	CMakeLists.txt
#	Makefile
#	examples/embedding/embedding.cpp
#	examples/imatrix/imatrix.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/llava/MobileVLM-README.md
#	examples/parallel/parallel.cpp
#	examples/perplexity/perplexity.cpp
#	examples/quantize/CMakeLists.txt
#	examples/server/README.md
#	examples/speculative/speculative.cpp
#	tests/test-backend-ops.cpp

47 changed files with 117978 additions and 117646 deletions
@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.prompt = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--no-perf"},
+        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](gpt_params & params) {
+            params.no_perf = true;
+            params.sparams.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(llama_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",

@@ -821,7 +821,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(lctx);
     }

     iparams.model = model;

@@ -917,6 +917,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

@@ -1829,6 +1830,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
     fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
     fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
     fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
     fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
     fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
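Taken together, these hunks thread the new no_perf flag from the command line down into the context parameters. A minimal sketch of the resulting flow, assuming a gpt_params object built the usual way (the variable names are illustrative, not part of the diff):

    gpt_params params;
    params.no_perf         = true;   // set by --no-perf or the LLAMA_ARG_NO_PERF environment variable
    params.sparams.no_perf = true;   // also silences the sampler-chain timings

    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    // cparams.no_perf is now true, so libllama skips its internal performance timing collection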
@@ -120,6 +120,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics

     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,

@@ -242,6 +243,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch

@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;

     auto * result = new gpt_sampler {
         /* .params = */ params,

@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance

     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }

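For reference, the old enum-based llama_perf_print calls seen on the minus side map onto the new per-object helpers as follows. A sketch, assuming a valid llama_context * ctx and llama_sampler * smpl:

    // old API (removed in this commit):
    //   llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
    //   llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
    //   llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT);

    // new API (used throughout the remaining hunks):
    llama_perf_sampler_print(smpl);   // sampler-chain timings
    llama_perf_context_print(ctx);    // decode/context timings
    llama_perf_context_reset(ctx);    // clear the context counters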
@@ -626,6 +626,9 @@ class Model:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"

         if res is None:
             logger.warning("\n")

@@ -2771,6 +2774,8 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_tokenizer_model("rwkv")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]

@@ -98,6 +98,7 @@ models = [
     {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
     {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]

@@ -363,7 +363,13 @@ if __name__ == '__main__':
                 yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

         def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-            dest = super().modify_tensors(data_torch, name, bid)
+            dest = list(super().modify_tensors(data_torch, name, bid))
+            # some archs may have the same tensor for lm_head and output (tie word embeddings)
+            # in this case, adapters targeting lm_head will fail when using llama-export-lora
+            # therefore, we ignore them for now
+            # see: https://github.com/ggerganov/llama.cpp/issues/9065
+            if name == "lm_head.weight" and len(dest) == 0:
+                raise ValueError("lm_head is present in adapter, but is ignored in base model")
             for dest_name, dest_data in dest:
                 assert isinstance(dest_data, LoraTorchTensor)
                 lora_a, lora_b = dest_data.get_lora_A_B()
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_batch_free(batch);

@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()

 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")

-llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
-llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count

@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");
@@ -184,7 +184,7 @@ int main(int argc, char ** argv) {

         ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);

-        TENSOR_DUMP(gf->nodes[0]);
+        TENSOR_DUMP(ggml_graph_node(gf, 0));

     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

@@ -225,7 +225,7 @@ int main(int argc, char ** argv) {

     // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));

     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
     printf("=====================================================================================\n");

@@ -253,7 +253,7 @@ int main(int argc, char ** argv) {

         // Check that the matrix multiplication result is in the right ballpark
         // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+        float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
         float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
         float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

@@ -226,8 +226,8 @@ static ggml_status compute_piter(
     result.eigenvectors.resize(params.n_batch);
     result.distances.resize(params.n_batch);
     // get output nodes
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        auto node = gf->nodes[i];
+    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+        auto node = ggml_graph_node(gf, i);
         int iter = -1;
         // find b_tensor (without copying data from device)
         if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_free(ctx);
     llama_free_model(model);

@@ -370,7 +370,7 @@ struct lora_merge_ctx {

         // write data to output file
         {
-            auto result = gf->nodes[gf->n_nodes - 1];
+            auto * result = ggml_graph_node(gf, -1);
             size_t len = ggml_nbytes(result);
             if (read_buf.size() < len) {
                 read_buf.resize(len);

@@ -2540,7 +2540,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     ggml_backend_graph_compute(ctx->backend, gf);

     // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);

     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
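The gf->nodes[gf->n_nodes - 1] pattern replaced above is now expressed through the graph accessors. A hedged sketch of reading back the final result after computing a graph, assuming an F32 output tensor and the usual headers (<vector> etc.); the variable names are illustrative:

    ggml_backend_graph_compute(backend, gf);

    struct ggml_tensor * out = ggml_graph_node(gf, -1);   // negative index counts from the end
    std::vector<float> result(ggml_nelements(out));
    ggml_backend_tensor_get(out, result.data(), 0, ggml_nbytes(out));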
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);

@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+            llama_perf_context_print(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);

@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx_llava->ctx_llama);

     ctx_llava->model = NULL;
     llava_free(ctx_llava);

@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

     LOG_TEE("\ntarget:\n\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_perf_print(ctx, smpl);

     gpt_sampler_free(smpl);

@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     // clean up
     llama_batch_free(query_batch);
@@ -3014,12 +3014,39 @@ int main(int argc, char ** argv) {
     const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
         const json body = json::parse(req.body);

-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
         }
-        const json data = format_tokenizer_response(tokens);
+
+        const json data = format_tokenizer_response(tokens_response);
         res_ok(res, data);
     };

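A sketch of what the extended endpoint accepts and returns; the field names come from the handler above, while the content string and response shape shown in the comments are illustrative. It assumes the usual nlohmann::json alias used by the server:

    // request body for POST /tokenize
    json req = {
        {"content",     "What is the capital of Germany?"},
        {"add_special", false},
        {"with_pieces", true}
    };
    // response with "with_pieces": {"tokens": [{"id": ..., "piece": "..."}, ...]}
    // without "with_pieces" the response keeps the old flat form: {"tokens": [...]}
    // pieces that are not valid UTF-8 are returned as an array of byte values instead of a string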
@@ -105,6 +105,14 @@ Feature: llama.cpp server
         Given first token is removed
         Then tokens can be detokenized

+    Scenario: Tokenize with pieces
+        When tokenizing with pieces:
+        """
+        What is the capital of Germany?
+        媽
+        """
+        Then tokens are given with pieces
+
     Scenario: Models available
         Given available models
         Then 1 models are supported
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 import asyncio
 import json
 import os

@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
     context.tokenize_add_special = True


+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    context.tokenized_text = context_text(context)
+    async with aiohttp.ClientSession() as session:
+        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+        if getattr(context, "tokenize_add_special", None) is not None:
+            tokenize_args["add_special"] = context.tokenize_add_special
+
+        async with session.post(
+            f"{context.base_url}/tokenize", json=tokenize_args
+        ) as response:
+            assert response.status == 200
+            tokenize_json = await response.json()
+            context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens are given with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    # Verify that the response contains both token IDs and pieces
+    assert all(
+        "id" in token and "piece" in token for token in context.tokens_with_pieces
+    )
+
+
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }

-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}
     };
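A few hedged examples of what the validator accepts and rejects, assuming <cassert> is included; "媽" from the feature file above encodes to the three UTF-8 bytes e5 aa bd:

    assert(is_valid_utf8("hello"));           // plain ASCII
    assert(is_valid_utf8("\xe5\xaa\xbd"));    // complete 3-byte sequence (媽)
    assert(!is_valid_utf8("\xe5\xaa"));       // truncated sequence -> the piece is sent as a byte array
    assert(!is_valid_utf8("\xff"));           // invalid lead byte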
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");
@@ -4,33 +4,23 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT

-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

-if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
-    GGML_SYCL_SINGLE_GPU=1
-else
-    GGML_SYCL_DEVICE=0
-    GGML_SYCL_SINGLE_GPU=0
-fi
-
 #export GGML_SYCL_DEBUG=1

 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.

-if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=llama-2-7b.Q4_0.gguf
+NGL=33
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none

 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
 fi

-#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
-
-#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
@@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
  */
 GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);

+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+
 /**
  * @brief Retrieves the description of a specific CANN device.
  *
@@ -364,6 +364,7 @@ extern "C" {

     struct ggml_object;
     struct ggml_context;
+    struct ggml_cgraph;

     // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {

@@ -581,23 +582,9 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM = 4,
     };

-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;

         GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

@@ -661,7 +648,7 @@ extern "C" {

     struct ggml_threadpool; // forward declaration, see ggml.c

     typedef struct ggml_threadpool * ggml_threadpool_t;

     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287

@@ -677,35 +664,6 @@ extern "C" {
         void * abort_callback_data;
     };

-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    typedef uint32_t ggml_bitset_t;
-
-    struct ggml_hash_set {
-        size_t size;
-        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
-        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_set;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;

@@ -2023,8 +1981,6 @@ extern "C" {
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);

-    #define GGML_N_TASKS_MAX -1
-
     GGML_API struct ggml_tensor * ggml_map_custom1(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,

@@ -2094,30 +2050,35 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * tensor);

     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view       (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // zero grads
-    GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
+    GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // zero grads
+    GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
+
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init  (struct ggml_threadpool_params *p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_threadpool*       ggml_threadpool_new          (struct ggml_threadpool_params * params);
+    GGML_API void                          ggml_threadpool_params_init  (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params * params);
     GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
     GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
     GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
     GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);

     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
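With struct ggml_cgraph now hidden from the public header, code that used to poke at gf->nodes and gf->n_nodes goes through the new accessors instead. A minimal sketch, assuming an already-built graph gf and <stdio.h>:

    struct ggml_tensor * first = ggml_graph_node(gf, 0);
    struct ggml_tensor * last  = ggml_graph_node(gf, -1);   // i < 0 counts from the end

    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        const struct ggml_tensor * node = ggml_graph_node(gf, i);
        printf("node %d: %s (%s)\n", i, node->name, ggml_op_name(node->op));
    }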
@@ -2515,6 +2476,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_gpublas (void);
     GGML_API int ggml_cpu_has_sse3 (void);
     GGML_API int ggml_cpu_has_ssse3 (void);
+    GGML_API int ggml_cpu_has_riscv_v (void);
     GGML_API int ggml_cpu_has_sycl (void);
     GGML_API int ggml_cpu_has_rpc (void);
     GGML_API int ggml_cpu_has_vsx (void);
@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"

@@ -30,6 +30,7 @@
 #include <cstring>
 #include <mutex>

+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
@@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
     return &ggml_backend_cann_buffer_types[device];
 }

+/**
+ * @brief Retrieves the name associated with a CANN host buffer type.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN host buffer type context.
+ *
+ * @param buft Pointer to the host buffer type context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return "CANN_Host";
+
+    GGML_UNUSED(buft);
+}
+
+/**
+ * @brief Retrieves the name associated with a CANN host buffer.
+ *
+ * This function returns the descriptive name associated with the specified
+ * CANN host buffer context.
+ *
+ * @param buft Pointer to the host buffer context.
+ * @return Const pointer to the C-style string containing the name.
+ */
+GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CANN_Host";
+
+    GGML_UNUSED(buffer);
+}
+
+/**
+ * @brief Free resources associated with a CANN host buffer.
+ *
+ * This function frees the resources associated with a CANN host buffer, including
+ * its context.
+ *
+ * @param buffer The CANN host buffer to free.
+ */
+GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
+    ACL_CHECK(aclrtFreeHost(buffer->context));
+}
+
+/**
+ * @brief Allocates a new CANN host buffer of the specified size.
+ *
+ * This function allocates a new CANN host buffer with the given size.
+ * @param size Size in bytes of the host buffer to allocate.
+ * @return Pointer to the allocated host buffer, or nullptr if allocation fails.
+ */
+static void * ggml_cann_host_malloc(size_t size) {
+    if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * hostPtr = nullptr;
+    aclError err = aclrtMallocHost((void **) &hostPtr, size);
+    if (err != ACL_SUCCESS) {
+
+        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, aclGetRecentErrMsg());
+        return nullptr;
+    }
+    return hostPtr;
+}
+
+/**
+ * @brief Allocates a new CANN host buffer of the specified type and size.
+ *
+ * @param buft Pointer to the host buffer type context.
+ * @param size Size in bytes of the host buffer to allocate.
+ * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
+ */
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * hostPtr = ggml_cann_host_malloc(size);
+
+    if (hostPtr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
+    buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
+    buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
+
+    return buffer;
+}
+
+/**
+ * @brief Interface for managing CANN host buffer types in the GGML backend.
+ *
+ * Provides function pointers for allocating, querying properties, and managing
+ * memory for CANN buffer types in the GGML backend.
+ */
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cann_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cann_buffer_type_host;
+}
+
 /**
  * @brief Computes the forward operation for a given tensor using CANN
  * operations.
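A hedged sketch of how a caller could request the new pinned host buffer type; the allocation falls back to the regular CPU buffer type when pinned memory is unavailable or GGML_CANN_NO_PINNED is set, and the 64 MiB size here is illustrative:

    ggml_backend_buffer_type_t buft = ggml_backend_cann_host_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 64 * 1024 * 1024);

    // ... fill the buffer on the CPU side, copy tensors to/from the NPU ...

    ggml_backend_buffer_free(buf);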
@@ -1,5 +1,5 @@
 #include "ggml-cuda.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 bool g_mul_mat_q = false;
@@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
 // bitset

+typedef uint32_t ggml_bitset_t;
+
 static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)

@@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
 #define GGML_HASHSET_FULL ((size_t)-1)
 #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)

+struct ggml_hash_set {
+    size_t size;
+    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
 struct ggml_hash_set ggml_hash_set_new(size_t size);
 void ggml_hash_set_free(struct ggml_hash_set * hash_set);

@@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
     GGML_ABORT("fatal error");
 }

+// computation graph
+
+struct ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct ggml_tensor ** nodes;
+    struct ggml_tensor ** grads;
+    struct ggml_tensor ** leafs;
+
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+};
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
 #ifdef __cplusplus
 }
 #endif
@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-kompute.h"
@@ -1,7 +1,7 @@
 #import "ggml-metal.h"

+#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
-#import "ggml.h"

 #import <Foundation/Foundation.h>

@@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel

     const int n_nodes = gf->n_nodes;
     const int n_cb = ctx->n_cb;
     const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;

@@ -1,5 +1,5 @@
 #include "ggml-rpc.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include <cinttypes>
@@ -33,7 +33,7 @@
 #include <sycl/half_type.hpp>

 #include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-sycl/backend.hpp"

[Two file diffs suppressed because they are too large.]
@@ -21,7 +21,7 @@
 #include <memory>
 #include <mutex>

-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-vulkan-shaders.cpp"

ggml/src/ggml.c (120 changed lines)
@@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
+#define GGML_N_TASKS_MAX (-1)

 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
@@ -1128,21 +1129,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F32x4_ADD vaddq_f32
 #define GGML_F32x4_MUL vmulq_f32
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     int offset = GGML_F32_ARR >> 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vaddq_f32(x[i], x[offset+i]); \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vaddq_f32(x[i], x[offset+i]); \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vaddq_f32(x[i], x[offset+i]); \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
     } \
-    res = GGML_F32x4_REDUCE_ONE(x[0]); \
+    (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
 }

 #define GGML_F32_VEC GGML_F32x4

@@ -1169,30 +1170,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
 #define GGML_F16x8_ADD vaddq_f16
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x) \
 do { \
     int offset = GGML_F16_ARR >> 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vaddq_f16(x[i], x[offset+i]); \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vaddq_f16(x[i], x[offset+i]); \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
     } \
     offset >>= 1; \
     for (int i = 0; i < offset; ++i) { \
-        x[i] = vaddq_f16(x[i], x[offset+i]); \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
     } \
-    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-    res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
+    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
 } while (0)

 #define GGML_F16_VEC GGML_F16x8
 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
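The only change in these two hunks is wrapping the macro parameters in parentheses. A generic illustration of why function-like macros do this (not taken from the diff; the macro and variable names are made up):

    #define FIRST_BAD(x)  x[0]
    #define FIRST_GOOD(x) (x)[0]

    float vals[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    // FIRST_BAD(vals + 4)  expands to  vals + 4[0]   -> parsed as vals + (4[0]), which does not compile
    // FIRST_GOOD(vals + 4) expands to  (vals + 4)[0] -> 5.0f, as intended
    float v = FIRST_GOOD(vals + 4);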
@@ -1901,6 +1902,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif

+//
+// ggml object
+//
+
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    enum ggml_object_type type;
+
+    char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 //
 // ggml context
 //
@@ -19215,6 +19233,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     ggml_hash_set_reset(&cgraph->visited_hash_set);
 }
 
+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+    return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+    if (i < 0) {
+        GGML_ASSERT(cgraph->n_nodes + i >= 0);
+        return cgraph->nodes[cgraph->n_nodes + i];
+    }
+
+    GGML_ASSERT(i < cgraph->n_nodes);
+    return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+    cgraph->nodes[cgraph->n_nodes] = tensor;
+    cgraph->n_nodes++;
+}
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
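Note: the accessors added above replace direct gf->nodes[...] indexing at the call sites changed later in this diff. A minimal usage sketch, not part of the diff, assuming an existing struct ggml_cgraph * gf built elsewhere:

    // last node of the graph, previously written as gf->nodes[gf->n_nodes - 1]
    struct ggml_tensor * last = ggml_graph_node(gf, -1);
    (void) last;

    // iterate over all nodes without touching cgraph internals
    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        (void) node; // e.g. inspect node->name here
    }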
@@ -23345,6 +23391,14 @@ int ggml_cpu_has_arm_fma(void) {
 #endif
 }
 
+int ggml_cpu_has_riscv_v(void) {
+#if defined(__riscv_v_intrinsic)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_metal(void) {
 #if defined(GGML_USE_METAL)
     return 1;
@@ -2388,7 +2388,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
 
     if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
     {
-        llama_perf_reset(llama_ctx_v4, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(llama_ctx_v4);
     }
 
     generation_finished = false; // Set current generation status
@@ -3317,7 +3317,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
-        llama_perf_print(llama_ctx_v4, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(llama_ctx_v4);
     }
 
     time2 = timer_check();
@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -1058,6 +1058,9 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
+
     // available samplers:
 
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
@@ -1175,13 +1178,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT       = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
     };
 
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
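Note: a minimal usage sketch of the new perf API declared above, not part of the diff; it assumes a context created with no_perf set to false and a sampler chain built with llama_sampler_chain_init:

    // assumes: struct llama_context * ctx;   // created with llama_context_params.no_perf == false
    // assumes: struct llama_sampler * smpl;  // built with llama_sampler_chain_init()
    struct llama_perf_context_data pd = llama_perf_context(ctx);   // raw context timing counters
    struct llama_perf_sampler_data sd = llama_perf_sampler(smpl);  // raw sampling counters
    (void) pd; (void) sd;                                          // or log them yourself

    llama_perf_context_print(ctx);     // load / prompt eval / eval / total timings
    llama_perf_sampler_print(smpl);    // sampling timings

    llama_perf_context_reset(ctx);     // start a fresh measurement window
    llama_perf_sampler_reset(smpl);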
@@ -891,7 +891,7 @@ public:
 #endif
 
     if (output != NULL) {
-        auto result = gf->nodes[gf->n_nodes - 1];
+        auto result = ggml_graph_node(gf, -1);
         if (*output == NULL && output_ctx != NULL) {
             *output = ggml_dup_tensor(output_ctx, result);
         }
@@ -2802,7 +2802,7 @@ static bool whisper_decode_internal(
         ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
     }
 
-    logits = gf->nodes[gf->n_nodes - 1];
+    logits = ggml_graph_node(gf, -1);
 
     if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
         return false;
@@ -349,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
 struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
     const auto * p = (const llama_sampler_chain *) chain->ctx;
 
-    if (i < 0 || i >= (int32_t) p->samplers.size()) {
+    if (i < 0 || (size_t) i >= p->samplers.size()) {
         return nullptr;
     }
 
     return p->samplers[i];
 }
 
+struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
+    auto * p = (llama_sampler_chain *) chain->ctx;
+
+    if (i < 0 || (size_t) i >= p->samplers.size()) {
+        return nullptr;
+    }
+
+    auto * result = p->samplers[i];
+    p->samplers.erase(p->samplers.begin() + i);
+
+    return result;
+}
+
 int llama_sampler_chain_n(const struct llama_sampler * chain) {
     const auto * p = (const llama_sampler_chain *) chain->ctx;
 
@@ -1656,3 +1669,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
 
     return LLAMA_DEFAULT_SEED;
 }
+
+// perf
+
+struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_sampler_data data = {};
+
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+    }
+
+    const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
+
+    data.t_sample_ms = 1e-3 * ctx->t_sample_us;
+    data.n_sample    = std::max(0, ctx->n_sample);
+
+    return data;
+}
+
+void llama_perf_sampler_print(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+
+    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
+
+void llama_perf_sampler_reset(struct llama_sampler * chain) {
+    if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
+        GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
+    }
+
+    auto * ctx = (struct llama_sampler_chain *) chain->ctx;
+
+    ctx->t_sample_us = ctx->n_sample = 0;
+}
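Note: per the new header comment, a sampler detached with llama_sampler_chain_remove is no longer owned by the chain. A minimal sketch of the expected call pattern, not part of the diff; it assumes a chain built with llama_sampler_chain_init/llama_sampler_chain_add and that the detached sampler is released with llama_sampler_free:

    struct llama_sampler * removed = llama_sampler_chain_remove(chain, 0);
    if (removed != NULL) {
        // the chain no longer owns it, so the caller is responsible for freeing it
        llama_sampler_free(removed);
    }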
src/llama.cpp
@@ -2170,6 +2170,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     if (host_buffer) {
         buft = ggml_backend_sycl_host_buffer_type();
     }
+#elif defined(GGML_USE_CANN)
+    if (host_buffer) {
+        buft = ggml_backend_cann_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -2496,6 +2500,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6707,8 +6712,6 @@ static bool llm_load_tensors(
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8648,14 +8651,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8717,6 +8719,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -9936,8 +9942,8 @@ struct llm_build_context {
     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
@@ -16284,8 +16290,8 @@ static int llama_decode_internal(
         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
         // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
+        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -16294,9 +16300,9 @@ static int llama_decode_internal(
         } else if (cparams.embeddings) {
             res  = nullptr; // do not extract logits for embedding case
             embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = ggml_graph_node(gf, i);
                     break;
                 }
             }
@@ -16513,15 +16519,15 @@ static int llama_encode_internal(
         // there are two cases here
         if (llama_model_has_decoder(&lctx.model)) {
             // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = ggml_graph_node(gf, -1);
             GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
         } else {
             // second case is an encoder-only T5 model
             if (cparams.embeddings) {
                 // only output embeddings if required
-                embd = gf->nodes[gf->n_nodes - 1];
+                embd = ggml_graph_node(gf, -1);
                 if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 2];
+                    embd = ggml_graph_node(gf, -2);
                 }
                 GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
             }
@@ -18022,6 +18028,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
+        /*.no_perf             =*/ true,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18218,6 +18225,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.flash_attn       = params.flash_attn;
+    cparams.no_perf          = params.no_perf;
     cparams.pooling_type     = params.pooling_type;
 
     cparams.n_ctx            = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -18555,7 +18563,7 @@ struct llama_context * llama_new_context_with_model(
 
             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -20146,10 +20154,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
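Note: with this hunk, per-token timing accumulation is skipped when cparams.no_perf is set, and llama_context_default_params() above now defaults no_perf to true. A minimal sketch of opting back in, not part of the diff; it assumes a llama_model * model obtained from the usual loading path:

    struct llama_context_params cp = llama_context_default_params();
    cp.no_perf = false;                                  // keep accumulating t_eval_us / t_p_eval_us
    struct llama_context * ctx = llama_new_context_with_model(model, cp);

    // ... llama_decode() calls ...

    llama_perf_context_print(ctx);                       // timings are only meaningful when no_perf == false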
@@ -20745,6 +20757,7 @@ const char * llama_print_system_info(void) {
     s += "ARM_FMA = "    + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
     s += "F16C = "       + std::to_string(ggml_cpu_has_f16c())      + " | ";
     s += "FP16_VA = "    + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
+    s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v())   + " | ";
     s += "WASM_SIMD = "  + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = "       + std::to_string(ggml_cpu_has_blas())      + " | ";
     s += "SSE3 = "       + std::to_string(ggml_cpu_has_sse3())      + " | ";
@@ -20756,65 +20769,40 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
-
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
-                const int32_t n_sampler = std::max(0, p->n_sample);
-
-                LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
-}
-
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
-
-                p->t_start_us  = ggml_time_us();
-                p->t_eval_us   = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
-
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
-}
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
+    data.t_start_ms  = 1e-3 * ctx->t_start_us;
+    data.t_load_ms   = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval    = std::max(1, ctx->n_p_eval);
+    data.n_eval      = std::max(1, ctx->n_eval);
+
+    return data;
+}
+
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
+
+    const double t_end_ms = 1e-3 * ggml_time_us();
+
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us  = ggml_time_us();
+    ctx->t_eval_us   = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+}
 
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {