Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00
Merge branch 'kquant_vocab_fix' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	Makefile
#	README.md
#	llama.cpp
#	tests/CMakeLists.txt
#	tests/test-grad0.c
#	tests/test-opt.c
Commit 15576bc865
13 changed files with 630 additions and 453 deletions
llama.cpp | 66
@@ -80,6 +80,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
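For orientation (not part of the diff): ggml_graph_compute() no longer reads a thread count off the graph; a caller first builds a ggml_cplan and hands it scratch memory, which is exactly what the helper above wraps. A minimal before/after sketch of a call site, assuming a ggml_cgraph gf already built, an int n_threads in scope, and the ggml.h / <vector> includes this file already has:

    // --- old API (removed by this change) ---
    // gf.n_threads = n_threads;
    // ggml_graph_compute(ctx0, &gf);

    // --- new API, as wrapped by ggml_graph_compute_helper() above ---
    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads); // fills in plan.work_size
    std::vector<uint8_t> scratch;
    if (plan.work_size > 0) {
        scratch.resize(plan.work_size);   // callers usually keep this buffer around and reuse it
        plan.work_data = scratch.data();
    }
    ggml_graph_compute(&gf, &plan);       // no ggml_context argument anymore

Keeping the scratch buffer outside the call is why llama_context gains a work_buffer member in the next hunk.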
@@ -322,6 +341,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -759,7 +781,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -1267,7 +1288,7 @@ static bool llama_eval_internal(
             const float * embd,
             const int n_tokens,
             const int n_past,
-            const int n_threads,
+                  int n_threads,
             const char * cgraph_fname) {
 
     // // enforce that the first token is BOS
@@ -1306,10 +1327,11 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
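Aside (not part of the diff): the substance of the BLAS heuristic is unchanged; it now clamps the local n_threads variable instead of writing gf.n_threads. Expressed as a hypothetical standalone helper, using the same ggml feature checks that appear above:

    // Hypothetical helper, same logic as the changed line: when BLAS handles the big
    // matrix multiplications of a large batch, extra CPU threads only spin-wait on it,
    // so prompts of 32 or more tokens are evaluated with a single thread.
    static int pick_eval_threads(int n_tokens, int n_threads_requested) {
        const bool use_blas = n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas();
        return use_blas ? 1 : n_threads_requested;
    }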
@@ -1593,6 +1615,7 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
@@ -1612,10 +1635,10 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif
 
     if (cgraph_fname) {
@@ -2405,15 +2428,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            bool convert_incompatible_tensor = false;
             if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
                 quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
-                    fprintf(stderr, "Verify before using\n");
-                    fprintf(stderr, "========================================================================================\n\n");
-                    // throw std::runtime_error("Unsupported tensor size encountered\n");
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    fprintf(stderr, "Q8_0 will be used for this tensor instead.\n");
+                    convert_incompatible_tensor = true;
                 }
             }
             if (tensor.name == "output.weight") {
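Background for the hunk above (not part of the diff): the k-quant formats pack weights in super-blocks of QK_K = 256 values, so a tensor can only be k-quantized if its first two dimensions are multiples of QK_K. The check can be read as the following sketch, with an illustrative helper name:

    // Illustrative only: mirrors the nx/ny test above. Tensors that fail it are no
    // longer a hard error; a later hunk switches them to Q8_0 so quantization of
    // the rest of the model can proceed.
    static bool is_k_quant_compatible(int64_t nx, int64_t ny) {
        return (nx % QK_K == 0) && (ny % QK_K == 0);
    }

Presumably this is the point of the kquant_vocab_fix branch: an output.weight whose vocabulary dimension is not a multiple of 256 now costs a slightly larger Q8_0 tensor instead of aborting the quantization.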
@@ -2441,6 +2464,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            if(convert_incompatible_tensor)
+            {
+                new_type = GGML_TYPE_Q8_0; //fall back to Q8_0 instead of just failing.
+            }
 #endif
 
         float * f32_data;
@@ -2575,8 +2602,8 @@ void llama_free_model(struct llama_model * model) {
 }
 
 struct llama_context * llama_new_context_with_model(
-                 struct llama_model * model,
-                 struct llama_context_params params) {
+        struct llama_model * model,
+        struct llama_context_params params) {
 
     if (!model) {
         return nullptr;
@@ -2646,7 +2673,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -2803,6 +2830,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
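Aside (not part of the diff): declaring work_buffer once, outside the while loop, lets every per-tensor LoRA graph reuse the same scratch allocation, which only grows to the largest plan.work_size seen. A rough sketch of the pattern, with hypothetical stand-ins for the file reading and graph building done inside the real loop:

    // has_next_lora_tensor() and build_apply_graph() are hypothetical placeholders;
    // ggml_graph_compute_helper() is the helper introduced at the top of this diff.
    std::vector<uint8_t> work_buffer;                    // grows once, reused every iteration
    while (has_next_lora_tensor()) {
        struct ggml_cgraph gf = build_apply_graph();     // graph applying one LoRA tensor
        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
    }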
@@ -2967,8 +2997,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -3121,7 +3151,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3141,7 +3170,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3227,7 +3256,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3247,7 +3275,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }