Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)
Merge branch 'master' into concedo_experimental
# Conflicts:
#   CMakeLists.txt
#   Makefile
#   README.md
#   tests/test-quantize-perf.cpp
Commit 282376c85a
21 changed files with 2853 additions and 482 deletions
llama.cpp (31 changes shown below)
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
+#endif
 #endif

 #include <array>
 #include <ctime>
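Note: the preprocessor block above picks the k-quants super-block size at compile time — QK_K stays 256 by default and becomes 64 when GGML_QKK_64 is defined. A minimal standalone sketch of the same selection (the main()/printf wrapper is only for illustration and is not part of llama.cpp):

    #include <cstdio>

    // Same default logic as the diff: 64-weight super-blocks only when GGML_QKK_64 is set.
    #ifndef QK_K
    #ifdef GGML_QKK_64
    #define QK_K 64
    #else
    #define QK_K 256
    #endif
    #endif

    int main() {
        // Prints 256 in a normal build, or 64 when compiled with -DGGML_QKK_64.
        std::printf("QK_K = %d\n", QK_K);
        return 0;
    }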
@@ -770,7 +774,7 @@ struct llama_model_loader {
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
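Note: the loader now tells llama_mmap whether the machine is NUMA via ggml_is_numa(). Presumably this lets the mapping code skip aggressive readahead so pages fault in on the node that first touches them; the sketch below is a hypothetical illustration of that pattern, not the actual llama_mmap implementation (mmap_readonly_region is an invented helper):

    #include <sys/mman.h>
    #include <cstddef>

    // Hypothetical helper: map a read-only region and only ask the kernel to prefetch
    // it when we are NOT on a NUMA system (so first-touch placement decides the node).
    static void * mmap_readonly_region(int fd, size_t size, size_t prefetch, bool numa) {
        void * addr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
            return nullptr;
        }
        if (numa) {
            prefetch = 0; // let each thread's first access pull pages to its own node
        }
        if (prefetch > 0) {
            madvise(addr, prefetch > size ? size : prefetch, MADV_WILLNEED);
        }
        return addr;
    }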
@@ -973,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();

     // needed to initialize f16 tables
@@ -982,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }

 int64_t llama_time_us() {
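Note: after this change, callers pass a NUMA flag at initialization and ggml_numa_init() runs only when requested. A minimal usage sketch, assuming llama.h from this revision is on the include path (the command-line handling and flag name are illustrative only):

    #include "llama.h"

    #include <cstring>

    int main(int argc, char ** argv) {
        // Enable NUMA-aware initialization only when the user asks for it.
        bool numa = false;
        for (int i = 1; i < argc; ++i) {
            if (std::strcmp(argv[i], "--numa") == 0) {
                numa = true;
            }
        }

        llama_init_backend(numa); // previously: llama_init_backend();

        // ... load a model and run inference here ...
        return 0;
    }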
@@ -1483,11 +1491,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");

-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");

-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");

@@ -2470,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;

+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
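Note: the new use_more_bits helper names the layer-selection heuristic that was previously written inline — roughly the first eighth of layers, the last eighth, and every third layer in between are bumped to a higher-precision quant type. A standalone sketch that prints which layers would qualify for an illustrative 32-layer model:

    #include <cstdio>

    // Same predicate as the lambda added in the diff.
    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        const int num_layers = 32; // e.g. a LLaMA-7B-sized model; chosen for illustration
        for (int i = 0; i < num_layers; ++i) {
            if (use_more_bits(i, num_layers)) {
                std::printf("layer %2d -> use more bits (e.g. GGML_TYPE_Q6_K)\n", i);
            }
        }
        return 0;
    }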
@@ -2524,15 +2536,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
-                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
-                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                        use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
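Note: with the lambda in place, the per-tensor overrides read as a short decision list, and the new QK_K == 64 branch additionally promotes attention.wv for the *_S file types in small-super-block builds. A condensed, self-contained sketch of the attention.wv rule (pick_wv_type is a hypothetical helper name; the enums come from llama.h/ggml.h):

    #include "ggml.h"
    #include "llama.h"

    #ifndef QK_K
    #define QK_K 256 // matches the default chosen at the top of llama.cpp
    #endif

    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    // Hypothetical helper mirroring the attention.wv branch of the quantization loop.
    static ggml_type pick_wv_type(llama_ftype ftype, int i_wv, int n_wv, ggml_type new_type) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(i_wv, n_wv)) new_type = GGML_TYPE_Q6_K;
        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                 (i_wv < n_wv/8 || i_wv >= 7*n_wv/8)) new_type = GGML_TYPE_Q6_K;
        return new_type;
    }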
@@ -2890,7 +2903,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
