// Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-10 04:00:53 +00:00); 379 lines, 13 KiB, C++.
// quantize.cpp : GGUF requantizer for ACE-Step
// Reads a BF16/F16/F32 GGUF, writes a quantized GGUF with a mixed-precision K-quant policy.
// Policy mirrors llama-quantize: important tensors (v_proj, down_proj) get
// bumped in S/M variants, embed_tokens uses Q6_K (Q8_0 for the Q8_0 type),
// and 1D tensors (norms/biases) are promoted to F32.
// Streaming write: one tensor at a time, low memory footprint for small configs.
//
// Usage: quantize <input.gguf> <output.gguf> <type>
// Types: Q2_K Q3_K_S Q3_K_M Q3_K_L Q4_K_S Q4_K_M Q5_K_S Q5_K_M Q6_K Q8_0

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

#ifdef _WIN32
#    include <windows.h>
#    define strcasecmp _stricmp
#else
#    include <fcntl.h>
#    include <strings.h>
#    include <sys/mman.h>
#    include <sys/stat.h>
#    include <unistd.h>
#endif

#include "ggml.h"
#include "gguf.h"

// Quant variant: base type + optional bump rules for important tensors
|
|
struct QuantVariant {
|
|
const char * name;
|
|
enum ggml_type base;
|
|
enum ggml_type bump; // type for "important" tensors (or COUNT = no bump)
|
|
enum ggml_type embed; // type for embed_tokens (or COUNT = same as base)
|
|
// bump_mode: 0=none, 1=first N layers, 2=first+last+every 3rd, 3=all important
|
|
int bump_mode;
|
|
int bump_n; // for mode 1: number of layers to bump
|
|
};
|
|
|
|
// Policy table, one row per quantization type accepted on the command line.
// Rows whose bump column is GGML_TYPE_COUNT never bump any tensor.
static const QuantVariant VARIANTS[] = {
    // name      base             bump             embed            mode  n
    { "Q2_K",   GGML_TYPE_Q2_K,  GGML_TYPE_Q4_K,  GGML_TYPE_Q6_K,  1,    4 },
    { "Q3_K_S", GGML_TYPE_Q3_K,  GGML_TYPE_COUNT, GGML_TYPE_Q6_K,  0,    0 },
    { "Q3_K_M", GGML_TYPE_Q3_K,  GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K,  2,    0 },
    { "Q3_K_L", GGML_TYPE_Q3_K,  GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K,  3,    0 },
    { "Q4_K_S", GGML_TYPE_Q4_K,  GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K,  1,    4 },
    { "Q4_K_M", GGML_TYPE_Q4_K,  GGML_TYPE_Q6_K,  GGML_TYPE_Q6_K,  2,    0 },
    { "Q5_K_S", GGML_TYPE_Q5_K,  GGML_TYPE_COUNT, GGML_TYPE_Q6_K,  0,    0 },
    { "Q5_K_M", GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K,  GGML_TYPE_Q6_K,  2,    0 },
    { "Q6_K",   GGML_TYPE_Q6_K,  GGML_TYPE_COUNT, GGML_TYPE_Q6_K,  0,    0 },
    { "Q8_0",   GGML_TYPE_Q8_0,  GGML_TYPE_COUNT, GGML_TYPE_Q8_0,  0,    0 },
};

// Look up a quantization variant by name (case-insensitive).
// Returns a pointer into VARIANTS, or nullptr when no row matches `s`.
static const QuantVariant * find_variant(const char * s) {
    for (const auto & v : VARIANTS) {
        if (strcasecmp(s, v.name) == 0) {
            return &v;
        }
    }
    return nullptr;
}

// Extract layer index from HF tensor name: model.layers.N.xxx -> N, else -1.
// Only the first "layers." occurrence is considered. The text after it must
// start with a decimal digit; otherwise (e.g. "layers.norm") the name carries
// no layer index and -1 is returned rather than a misleading 0.
static int extract_layer(const char * name) {
    static const char marker[] = "layers.";

    const char * p = strstr(name, marker);
    if (!p) {
        return -1;
    }

    const char * digits = p + (sizeof(marker) - 1);
    if (*digits < '0' || *digits > '9') {
        return -1;
    }

    // strtol saturates instead of invoking undefined behaviour on overflow;
    // reject anything that does not survive the round trip through int.
    const long value = strtol(digits, nullptr, 10);
    const int  layer = (int) value;
    if ((long) layer != value) {
        return -1;
    }
    return layer;
}

// Important tensors for S/M: v_proj + down_proj.
// Matches by substring, so any prefix before "v_proj.weight" or
// "down_proj.weight" is accepted.
static bool is_important_sm(const char * name) {
    return strstr(name, "v_proj.weight") != nullptr ||
           strstr(name, "down_proj.weight") != nullptr;
}

// Important tensors for L: v_proj + down_proj + o_proj
|
|
static bool is_important_l(const char * name) {
|
|
return is_important_sm(name) ||
|
|
(strstr(name, "o_proj.weight") != nullptr);
|
|
}
|
|
|
|
// True when the tensor name contains "embed_tokens.weight".
static bool is_embed(const char * name) {
    return strstr(name, "embed_tokens.weight") != nullptr;
}

// Should this tensor be quantized at all?
//   name   : tensor name
//   n_dims : number of dimensions of the tensor
//   arch   : value of general.architecture
// Returns false for every tensor of an architecture containing "vae", for
// tensors with fewer than 2 dimensions, for embed_tokens of a "text-enc"
// architecture, and for the tensors listed in `never_quantized`.
static bool should_quantize(const char * name, int n_dims, const char * arch) {
    if (strstr(arch, "vae")) {
        return false;
    }
    if (n_dims < 2) {
        return false;
    }
    if (strstr(arch, "text-enc") && strstr(name, "embed_tokens")) {
        return false;
    }

    // Tensors excluded by name regardless of architecture.
    static const char * const never_quantized[] = {
        "silence_latent",
        "scale_shift_table",
        "null_condition_emb",
    };
    for (const char * needle : never_quantized) {
        if (strstr(name, needle)) {
            return false;
        }
    }
    return true;
}

// Decide target type for a single tensor given the variant + layer info
|
|
static enum ggml_type pick_type(const char * name, int n_dims, const char * arch,
|
|
const QuantVariant & v, int n_layers) {
|
|
if (!should_quantize(name, n_dims, arch)) return GGML_TYPE_COUNT;
|
|
|
|
// embed_tokens in LM: use embed type
|
|
if (is_embed(name) && !strstr(arch, "text-enc")) {
|
|
return (v.embed != GGML_TYPE_COUNT) ? v.embed : v.base;
|
|
}
|
|
|
|
// Important tensor bump logic
|
|
bool important = (v.bump_mode == 3) ? is_important_l(name) : is_important_sm(name);
|
|
|
|
if (important && v.bump != GGML_TYPE_COUNT) {
|
|
int layer = extract_layer(name);
|
|
bool bumped = false;
|
|
switch (v.bump_mode) {
|
|
case 1: // first N layers only
|
|
bumped = (layer >= 0 && layer < v.bump_n);
|
|
break;
|
|
case 2: { // M variant: first few + last few + every 3rd
|
|
int ql = n_layers;
|
|
bumped = (layer >= 0) &&
|
|
(layer < ql / 9 || layer >= ql - ql / 7 || layer % 3 == 0);
|
|
break;
|
|
}
|
|
case 3: // L variant: all important tensors (v+down+o_proj)
|
|
bumped = true;
|
|
break;
|
|
}
|
|
if (bumped) return v.bump;
|
|
}
|
|
|
|
return v.base;
|
|
}
|
|
|
|
// Promote 1D tensors (norms/biases) to F32 for precision.
// True for any tensor with fewer than 2 dimensions.
static bool should_promote_f32(int n_dims) {
    return n_dims < 2;
}

// Convert source data to F32
|
|
static bool to_f32(const void * src, float * dst, int64_t n, enum ggml_type type) {
|
|
switch (type) {
|
|
case GGML_TYPE_BF16:
|
|
ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, n);
|
|
return true;
|
|
case GGML_TYPE_F16:
|
|
ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, n);
|
|
return true;
|
|
case GGML_TYPE_F32:
|
|
memcpy(dst, src, (size_t)n * sizeof(float));
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
int main(int argc, char ** argv) {
|
|
if (argc != 4) {
|
|
fprintf(stderr, "Usage: %s <input.gguf> <output.gguf> <type>\n", argv[0]);
|
|
fprintf(stderr, "Types:");
|
|
for (const auto & v : VARIANTS) fprintf(stderr, " %s", v.name);
|
|
fprintf(stderr, "\n");
|
|
return 1;
|
|
}
|
|
|
|
const char * inp_path = argv[1];
|
|
const char * out_path = argv[2];
|
|
const QuantVariant * variant = find_variant(argv[3]);
|
|
|
|
if (!variant) {
|
|
fprintf(stderr, "[Quantize] Unknown type: %s\n", argv[3]);
|
|
return 1;
|
|
}
|
|
|
|
fprintf(stderr, "[Quantize] %s -> %s (%s)\n", inp_path, out_path, variant->name);
|
|
|
|
// Mmap input file
|
|
#ifdef _WIN32
|
|
HANDLE fh = CreateFileA(inp_path, GENERIC_READ, FILE_SHARE_READ, NULL,
|
|
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
|
|
if (fh == INVALID_HANDLE_VALUE) {
|
|
fprintf(stderr, "[Quantize] Failed to open %s\n", inp_path);
|
|
return 1;
|
|
}
|
|
HANDLE mh = CreateFileMappingA(fh, NULL, PAGE_READONLY, 0, 0, NULL);
|
|
if (!mh) {
|
|
fprintf(stderr, "[Quantize] CreateFileMapping failed %s\n", inp_path);
|
|
CloseHandle(fh);
|
|
return 1;
|
|
}
|
|
void * mapping = MapViewOfFile(mh, FILE_MAP_READ, 0, 0, 0);
|
|
if (!mapping) {
|
|
fprintf(stderr, "[Quantize] MapViewOfFile failed %s\n", inp_path);
|
|
CloseHandle(mh); CloseHandle(fh);
|
|
return 1;
|
|
}
|
|
#else
|
|
int fd = open(inp_path, O_RDONLY);
|
|
if (fd < 0) { perror("open"); return 1; }
|
|
struct stat st;
|
|
fstat(fd, &st);
|
|
size_t file_size = (size_t)st.st_size;
|
|
void * mapping = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
|
if (mapping == MAP_FAILED) { perror("mmap"); close(fd); return 1; }
|
|
#endif
|
|
|
|
// Parse input GGUF
|
|
struct gguf_init_params params = { /*no_alloc=*/ true, /*ctx=*/ nullptr };
|
|
struct ggml_context * meta = nullptr;
|
|
params.ctx = &meta;
|
|
|
|
struct gguf_context * inp = gguf_init_from_file(inp_path, params);
|
|
if (!inp) {
|
|
fprintf(stderr, "[Quantize] Failed to read %s\n", inp_path);
|
|
#ifdef _WIN32
|
|
UnmapViewOfFile(mapping); CloseHandle(mh); CloseHandle(fh);
|
|
#else
|
|
munmap(mapping, file_size); close(fd);
|
|
#endif
|
|
return 1;
|
|
}
|
|
|
|
const size_t data_off = gguf_get_data_offset(inp);
|
|
const int n_tensors = (int)gguf_get_n_tensors(inp);
|
|
|
|
// Read architecture
|
|
char arch[64] = "unknown";
|
|
{
|
|
int64_t idx = gguf_find_key(inp, "general.architecture");
|
|
if (idx >= 0) {
|
|
const char * s = gguf_get_val_str(inp, (int)idx);
|
|
snprintf(arch, sizeof(arch), "%s", s);
|
|
}
|
|
}
|
|
|
|
// Read block count for bump policy
|
|
int n_layers = 0;
|
|
{
|
|
char key[128];
|
|
snprintf(key, sizeof(key), "%s.block_count", arch);
|
|
int64_t idx = gguf_find_key(inp, key);
|
|
if (idx >= 0) n_layers = (int)gguf_get_val_u32(inp, (int)idx);
|
|
}
|
|
|
|
fprintf(stderr, "[Quantize] Arch=%s Layers=%d\n", arch, n_layers);
|
|
|
|
// Create output GGUF: copy KV metadata
|
|
struct gguf_context * out = gguf_init_empty();
|
|
gguf_set_kv(out, inp);
|
|
gguf_set_val_u32(out, "general.quantization_version", 2);
|
|
gguf_set_val_str(out, "general.file_type", variant->name);
|
|
|
|
// Plan: for each tensor, decide target type
|
|
struct TensorPlan { enum ggml_type target; bool quantize; bool promote; };
|
|
std::vector<TensorPlan> plans((size_t)n_tensors);
|
|
|
|
for (int i = 0; i < n_tensors; i++) {
|
|
const char * name = gguf_get_tensor_name(inp, i);
|
|
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
|
const int n_dims = ggml_n_dims(t);
|
|
|
|
gguf_add_tensor(out, t);
|
|
plans[(size_t)i] = {GGML_TYPE_COUNT, false, false};
|
|
|
|
enum ggml_type target = pick_type(name, n_dims, arch, *variant, n_layers);
|
|
|
|
// Promote 1D norms/biases BF16/F16 -> F32
|
|
if (target == GGML_TYPE_COUNT && should_promote_f32(n_dims) &&
|
|
(t->type == GGML_TYPE_BF16 || t->type == GGML_TYPE_F16)) {
|
|
gguf_set_tensor_type(out, name, GGML_TYPE_F32);
|
|
plans[(size_t)i] = {GGML_TYPE_F32, false, true};
|
|
continue;
|
|
}
|
|
|
|
if (target == GGML_TYPE_COUNT) continue;
|
|
|
|
bool can_convert = (t->type == GGML_TYPE_BF16 ||
|
|
t->type == GGML_TYPE_F16 ||
|
|
t->type == GGML_TYPE_F32);
|
|
bool aligned = (t->ne[0] % ggml_blck_size(target) == 0);
|
|
|
|
if (can_convert && aligned) {
|
|
gguf_set_tensor_type(out, name, target);
|
|
plans[(size_t)i] = {target, true, false};
|
|
}
|
|
}
|
|
|
|
// Write metadata only (header + tensor info, no data)
|
|
bool ok = gguf_write_to_file(out, out_path, true);
|
|
if (!ok) {
|
|
fprintf(stderr, "[Quantize] Failed to write metadata %s\n", out_path);
|
|
return 1;
|
|
}
|
|
|
|
// Stream tensor data one at a time (low memory)
|
|
FILE * fout = fopen(out_path, "ab");
|
|
if (!fout) {
|
|
fprintf(stderr, "[Quantize] Failed to open %s for append\n", out_path);
|
|
return 1;
|
|
}
|
|
|
|
const size_t alignment = gguf_get_alignment(out);
|
|
int n_quantized = 0, n_promoted = 0;
|
|
int64_t bytes_in = 0, bytes_out = 0;
|
|
size_t data_pos = 0;
|
|
|
|
for (int i = 0; i < n_tensors; i++) {
|
|
const char * name = gguf_get_tensor_name(inp, i);
|
|
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
|
const int64_t nel = ggml_nelements(t);
|
|
const size_t src_size = ggml_nbytes(t);
|
|
const size_t t_off = gguf_get_tensor_offset(inp, i);
|
|
const void * src = (const uint8_t *)mapping + data_off + t_off;
|
|
|
|
bytes_in += (int64_t)src_size;
|
|
|
|
// Pad to alignment boundary
|
|
size_t pad = (alignment - (data_pos % alignment)) % alignment;
|
|
if (pad > 0) {
|
|
uint8_t zeros[64] = {};
|
|
fwrite(zeros, 1, pad, fout);
|
|
data_pos += pad;
|
|
}
|
|
|
|
const TensorPlan & plan = plans[(size_t)i];
|
|
|
|
if (plan.promote) {
|
|
// BF16/F16 -> F32
|
|
std::vector<float> f32((size_t)nel);
|
|
to_f32(src, f32.data(), nel, t->type);
|
|
size_t out_size = (size_t)nel * sizeof(float);
|
|
fwrite(f32.data(), 1, out_size, fout);
|
|
data_pos += out_size;
|
|
bytes_out += (int64_t)out_size;
|
|
n_promoted++;
|
|
} else if (plan.quantize) {
|
|
// Quantize: src -> f32 -> target
|
|
std::vector<float> f32((size_t)nel);
|
|
to_f32(src, f32.data(), nel, t->type);
|
|
|
|
const int64_t n_per_row = t->ne[0];
|
|
const int64_t nrows = nel / n_per_row;
|
|
const size_t qsize = ggml_row_size(plan.target, n_per_row) * (size_t)nrows;
|
|
|
|
std::vector<uint8_t> qbuf(qsize);
|
|
ggml_quantize_chunk(plan.target, f32.data(), qbuf.data(),
|
|
0, nrows, n_per_row, nullptr);
|
|
|
|
fwrite(qbuf.data(), 1, qsize, fout);
|
|
data_pos += qsize;
|
|
bytes_out += (int64_t)qsize;
|
|
n_quantized++;
|
|
} else {
|
|
// Keep as-is
|
|
fwrite(src, 1, src_size, fout);
|
|
data_pos += src_size;
|
|
bytes_out += (int64_t)src_size;
|
|
}
|
|
}
|
|
|
|
fclose(fout);
|
|
|
|
fprintf(stderr, "[Quantize] Quantized %d/%d tensors, promoted %d to F32\n",
|
|
n_quantized, n_tensors, n_promoted);
|
|
fprintf(stderr, "[Quantize] %.1f GB -> %.1f GB (%.1fx)\n",
|
|
(double)bytes_in / 1e9, (double)bytes_out / 1e9,
|
|
bytes_out > 0 ? (double)bytes_in / (double)bytes_out : 0.0);
|
|
fprintf(stderr, "[Quantize] Wrote %s\n", out_path);
|
|
|
|
gguf_free(out);
|
|
gguf_free(inp);
|
|
ggml_free(meta);
|
|
#ifdef _WIN32
|
|
UnmapViewOfFile(mapping);
|
|
CloseHandle(mh);
|
|
CloseHandle(fh);
|
|
#else
|
|
munmap(mapping, file_size);
|
|
close(fd);
|
|
#endif
|
|
|
|
return 0;
|
|
}
|