Merge branch 'master' into concedo_experimental
# Conflicts:
#	Makefile
#	README.md
#	ggml.h
#	llama.cpp
commit cde3760e52
13 changed files with 527 additions and 225 deletions
llama.cpp | 43
@@ -1431,7 +1431,7 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    ggml_cgraph gf = {};
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
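The main theme of this hunk, and of the rest of the diff, is that the compute graph is no longer a stack object but is allocated inside the ggml context with ggml_new_graph(), so every "&gf" below becomes "gf". A minimal sketch of the new pattern, not taken from this commit; the buffer size and tensors are illustrative assumptions:

// Minimal sketch (not from this commit) of the new graph-allocation pattern:
// the graph metadata lives inside the ggml context, so the context buffer
// must also cover ggml_graph_overhead(). Sizes and tensors are illustrative.
#include "ggml.h"

static void build_graph_sketch(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // arena-allocated graph instead of "ggml_cgraph gf = {};" on the stack
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);

    // the graph pointer is passed directly; no more '&gf'
    ggml_build_forward_expand(gf, ggml_add(ctx0, a, b));

    ggml_free(ctx0);
}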
@@ -1548,8 +1548,8 @@ static bool llama_eval_internal(
             ggml_set_name(v, "v");
 
             // important: storing RoPE-ed version of K in the KV cache!
-            ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-            ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
         }
 
         struct ggml_tensor * Q =
@@ -1719,21 +1719,22 @@ static bool llama_eval_internal(
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf, cur);
+    ggml_build_forward_expand(gf, cur);
 
     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
 
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
-        if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-            ggml_metal_graph_find_concurrency(lctx.ctx_metal,&gf);
-        }
+        // TODO: disabled until #2413 is resolved
+        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
+        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
+        //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
@@ -1752,34 +1753,34 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(gf, cgraph_fname);
     }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    ggml_graph_print(&gf);
+    ggml_graph_print(gf);
 #endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
     //}
 
     // extract logits
@@ -1930,7 +1931,9 @@ struct llama_tokenizer {
         if (token == vocab_.token_to_id.end()) {
             // output any symbols that did not form tokens as bytes.
             for (int j = 0; j < (int) symbol.n; ++j) {
-                llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                // NOTE: old version, before #2420 - not sure what are the implications of this
+                //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                 output.push_back(token_id);
             }
         } else {
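The tokenizer change above swaps the old fixed byte-offset fallback for a vocab lookup. A hypothetical side-by-side helper to illustrate the two behaviours; the map type and names are assumptions, not the actual llama_vocab layout:

// Hypothetical illustration of the fallback change above; not part of the diff.
#include <cstdint>
#include <string>
#include <unordered_map>

using token_id = int32_t;

// old behaviour: byte tokens are assumed to occupy fixed ids starting at 3
token_id byte_fallback_old(uint8_t byte) {
    return (token_id) byte + 3;
}

// new behaviour: look the single character up in the vocab; .at() throws
// std::out_of_range if the byte has no token, instead of silently mapping it
token_id byte_fallback_new(const std::unordered_map<std::string, token_id> & token_to_id, char c) {
    return token_to_id.at(std::string(1, c));
}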
@@ -3186,7 +3189,7 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
         ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 
         ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
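This resize is the other half of the ggml_new_graph() change: since the graph metadata is now carried inside buf_compute's context, the buffer has to grow by ggml_graph_overhead(). A hypothetical sizing helper, with names made up for illustration:

// Hypothetical helper mirroring the buf_compute change; eval_mem and
// blasbatchmul stand in for MEM_REQ_EVAL().at(model_type) and the BLAS
// batch multiplier used above.
#include "ggml.h"
#include <cstddef>

static size_t compute_buf_size(size_t eval_mem, size_t blasbatchmul) {
    return blasbatchmul*eval_mem + ggml_graph_overhead();
}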
@@ -3671,7 +3674,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int    n_layer = hparams.n_layer;
-    const int    n_embd  = hparams.n_embd;
+    const int    n_embd  = hparams.n_embd_gqa();
     const int    n_ctx   = hparams.n_ctx;
 
     const size_t kv_size = kv_self.buf.size;
@@ -3774,7 +3777,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & kv_self = ctx->kv_self;
     const auto & hparams = ctx->model.hparams;
     const int    n_layer = hparams.n_layer;
-    const int    n_embd  = hparams.n_embd;
+    const int    n_embd  = hparams.n_embd_gqa();
     const int    n_ctx   = hparams.n_ctx;
 
     size_t kv_size;
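Both llama_copy_state_data and llama_set_state_data now size the serialized KV cache with n_embd_gqa() instead of n_embd, which matters for grouped-query attention models (e.g. LLaMA-2 70B) where the K/V head count is smaller than the query head count. An illustrative calculation, assuming the usual llama.cpp relation n_embd_gqa = (n_embd / n_head) * n_head_kv:

// Illustrative only: element count of the KV cache that gets copied in and out
// of the session state. With GQA, each position stores n_embd_gqa values per
// K and per V, per layer - not n_embd.
#include <cstddef>

static size_t kv_state_elements(int n_embd_gqa, int n_ctx, int n_layer) {
    return 2u * (size_t) n_embd_gqa * (size_t) n_ctx * (size_t) n_layer;
}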