Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)

Merge branch 'master' into concedo

# Conflicts:
#	.github/workflows/build.yml
commit 82d74ca1a6
10 changed files with 585 additions and 114 deletions
llama.cpp | 74
@@ -24,6 +24,9 @@
 #include <memory>
 #include <algorithm>
 #include <initializer_list>
+#include <thread>
+#include <atomic>
+#include <mutex>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -486,6 +489,7 @@ struct llama_file_loader {
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q4_2:
+                case GGML_TYPE_Q4_3:
                     break;
                 default: {
                     throw format("unrecognized tensor type %u\n", shard.type);
@@ -559,6 +563,7 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
+            case GGML_TYPE_Q4_3:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -848,6 +853,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
+        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         default: return "unknown, may not work";
     }
 }
@@ -1576,15 +1582,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //

-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
         default: throw format("invalid output file type %d\n", ftype);
     };

+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
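
Annotation: the new nthread argument is resolved at the top of the function; any value of 0 or less falls back to std::thread::hardware_concurrency(). One caveat the patch leaves open is that the standard allows hardware_concurrency() to return 0 when the count cannot be determined. A minimal hedged sketch of the same normalization with an extra floor of 1; the clamp is an assumption of mine, not part of this commit:

#include <algorithm>
#include <thread>

// Sketch only: resolve a user-supplied thread count the way the patch does,
// plus a floor of 1 in case hardware_concurrency() reports 0 (it may).
static int resolve_nthread(int nthread) {
    if (nthread <= 0) {
        nthread = (int) std::thread::hardware_concurrency(); // 0 if unknown
    }
    return std::max(nthread, 1); // extra guard, not present in the diff
}
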
@@ -1593,6 +1604,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);

+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -1611,6 +1625,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor.ne.size() == 2);

+        // uncomment this to keep the output layer in FP16
+        //if (tensor.name == "output.weight") {
+        //    quantize = false;
+        //}
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -1646,21 +1665,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);

-            switch (new_type) {
-                case GGML_TYPE_Q4_0:
-                    {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_1:
-                    {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                case GGML_TYPE_Q4_2:
-                    {
-                        new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
-                    } break;
-                default:
-                    LLAMA_ASSERT(false);
-            }
+            int chunk_size = 32 * 512;
+            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
+            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+            if (nthread_use < 2) {
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+            } else {
+                size_t counter = 0;
+                new_size = 0;
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+                    std::vector<int64_t> local_hist;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        size_t first = counter; counter += chunk_size;
+                        if (first >= nelements) {
+                            if (!local_hist.empty()) {
+                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        size_t last = std::min(nelements, first + chunk_size);
+                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
+                    }
+                };
+                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+                compute();
+                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+            }

             printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
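
Annotation: the replacement code above is a small dynamic work queue. A mutex-guarded counter hands out fixed-size chunks of 32*512 elements, each thread quantizes its chunks with ggml_quantize_chunk while accumulating a private histogram and byte count, and those locals are folded into hist_cur and new_size under the same lock just before each thread exits. The sketch below re-creates that pattern in isolation, with a dummy per-chunk reduction standing in for ggml_quantize_chunk; names such as process_chunk and parallel_process are mine, not part of llama.cpp.

#include <algorithm>
#include <cstddef>
#include <mutex>
#include <thread>
#include <vector>

// Sketch only: dummy per-chunk work standing in for ggml_quantize_chunk.
static double process_chunk(const std::vector<float> & data, size_t first, size_t count) {
    double s = 0.0;
    for (size_t i = first; i < first + count; ++i) s += data[i] * data[i];
    return s;
}

// Sketch only: the chunked worker pattern from the patch, applied to the dummy reduction.
static double parallel_process(const std::vector<float> & data, int nthread, size_t chunk_size) {
    std::mutex mutex;
    size_t counter = 0;    // next element index to hand out, guarded by mutex
    double total   = 0.0;  // merged result, guarded by mutex
    const size_t n = data.size();

    auto compute = [&]() {
        double local = 0.0;
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex);
                first = counter;
                counter += chunk_size;
                if (first >= n) {   // no work left: merge the local result and leave
                    total += local;
                    break;
                }
            }
            const size_t last = std::min(n, first + chunk_size);
            local += process_chunk(data, first, last - first);
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
    compute();                              // the calling thread also works
    for (auto & w : workers) w.join();
    return total;
}

Handing chunks out dynamically rather than pre-splitting the range keeps all threads busy even when chunks finish at uneven speed, and the calling thread doubles as a worker. The real patch also short-circuits to a single direct ggml_quantize_chunk call when fewer than two threads would be used; the sketch keeps only the threaded path for brevity.
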
@@ -1782,9 +1817,10 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype) {
+        enum llama_ftype ftype,
+        int nthread) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
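
Annotation: with the fourth parameter in place, callers of the public entry point choose the thread count themselves, and passing 0 delegates the choice to the library (hardware_concurrency, per the earlier hunk). A hedged usage sketch; the file paths are placeholders and the error handling is minimal:

#include <cstdio>
#include "llama.h"

// Sketch only: quantize a model through the updated 4-argument entry point.
int main() {
    const char * fname_inp = "models/7B/ggml-model-f16.bin";   // placeholder path
    const char * fname_out = "models/7B/ggml-model-q4_2.bin";  // placeholder path

    // nthread = 0 lets the library pick std::thread::hardware_concurrency().
    const int rc = llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_2, 0);
    if (rc != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
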
@@ -1970,7 +2006,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             base_t = dest_t;
         }

-        if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
+        if (ggml_is_quantized(base_t->type)) {
             if (!warned) {
                 fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
                                 "use a f16 or f32 base model with --lora-base\n", __func__);
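
Annotation: the last hunk swaps an enumerated type comparison for ggml_is_quantized(), so the LoRA quality warning also covers Q4_3 and any quantized format added later without further edits here. A small hedged illustration of the difference; the two helper names are mine, only ggml_is_quantized() and the type constants come from ggml:

#include "ggml.h"

// Sketch only: the enumerated test (what the diff removes) misses newer
// quantized types such as GGML_TYPE_Q4_3; the ggml predicate does not.
static bool is_quantized_enumerated(const struct ggml_tensor * t) {
    return t->type == GGML_TYPE_Q4_0 || t->type == GGML_TYPE_Q4_1 || t->type == GGML_TYPE_Q4_2;
}

static bool is_quantized_predicate(const struct ggml_tensor * t) {
    return ggml_is_quantized(t->type); // covers current and future quantized formats
}
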