koboldcpp/otherarch/acestep/quantize-acestep.cpp
2026-02-22 13:22:24 +08:00

379 lines
13 KiB
C++

// quantize.cpp : GGUF requantizer for ACE-Step
// Reads BF16 GGUF, writes quantized GGUF with mixed-precision K-quant policy.
// Policy mirrors llama-quantize: important tensors (v_proj, down_proj) get
// bumped in S/M variants, embed_tokens always Q6_K, norms promoted to F32.
// Streaming write: one tensor at a time, low memory footprint for small configs.
//
// Usage: quantize <input.gguf> <output.gguf> <type>
// Types: Q2_K Q3_K_S Q3_K_M Q3_K_L Q4_K_S Q4_K_M Q5_K_S Q5_K_M Q6_K Q8_0
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#ifdef _WIN32
# include <windows.h>
# define strcasecmp _stricmp
#else
# include <sys/mman.h>
# include <sys/stat.h>
# include <fcntl.h>
# include <unistd.h>
#endif
#include "ggml.h"
#include "gguf.h"
// Quant variant: base type + optional bump rules for important tensors.
// One row per user-selectable quantization preset; see VARIANTS below.
struct QuantVariant {
    const char * name;     // CLI name, matched case-insensitively (e.g. "Q4_K_M")
    enum ggml_type base;   // default target type for quantizable (2D+) tensors
    enum ggml_type bump;   // type for "important" tensors (or COUNT = no bump)
    enum ggml_type embed;  // type for embed_tokens (or COUNT = same as base)
    // bump_mode: 0=none, 1=first N layers, 2=first+last+every 3rd, 3=all important
    int bump_mode;
    int bump_n;            // for mode 1: number of layers to bump
};
// Preset table. Policy mirrors llama-quantize's mixed-precision ftypes:
// S variants bump nothing (or only the first few layers), M variants bump
// first/last/every-3rd layer, L bumps all important tensors. embed_tokens
// always gets Q6_K (Q8_0 for the Q8_0 preset).
static const QuantVariant VARIANTS[] = {
    // name     base             bump              embed            mode n
    {"Q2_K",    GGML_TYPE_Q2_K, GGML_TYPE_Q4_K,  GGML_TYPE_Q6_K, 1, 4},
    {"Q3_K_S",  GGML_TYPE_Q3_K, GGML_TYPE_COUNT, GGML_TYPE_Q6_K, 0, 0},
    {"Q3_K_M",  GGML_TYPE_Q3_K, GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K, 2, 0},
    {"Q3_K_L",  GGML_TYPE_Q3_K, GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K, 3, 0},
    {"Q4_K_S",  GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,  GGML_TYPE_Q6_K, 1, 4},
    {"Q4_K_M",  GGML_TYPE_Q4_K, GGML_TYPE_Q6_K,  GGML_TYPE_Q6_K, 2, 0},
    {"Q5_K_S",  GGML_TYPE_Q5_K, GGML_TYPE_COUNT, GGML_TYPE_Q6_K, 0, 0},
    {"Q5_K_M",  GGML_TYPE_Q5_K, GGML_TYPE_Q6_K,  GGML_TYPE_Q6_K, 2, 0},
    {"Q6_K",    GGML_TYPE_Q6_K, GGML_TYPE_COUNT, GGML_TYPE_Q6_K, 0, 0},
    {"Q8_0",    GGML_TYPE_Q8_0, GGML_TYPE_COUNT, GGML_TYPE_Q8_0, 0, 0},
};
// Look up a quantization preset by its (case-insensitive) CLI name.
// Returns nullptr when no preset matches.
static const QuantVariant * find_variant(const char * s) {
    const size_t n = sizeof(VARIANTS) / sizeof(VARIANTS[0]);
    for (size_t i = 0; i < n; i++) {
        if (strcasecmp(s, VARIANTS[i].name) == 0) {
            return &VARIANTS[i];
        }
    }
    return nullptr;
}
// Extract the layer index from an HF tensor name:
// "model.layers.N.xxx" -> N; returns -1 when no "layers." component exists.
static int extract_layer(const char * name) {
    const char * hit = strstr(name, "layers.");
    return hit ? atoi(hit + strlen("layers.")) : -1;
}
// Important tensors for S/M presets: attention v_proj and MLP down_proj.
static bool is_important_sm(const char * name) {
    static const char * const targets[] = { "v_proj.weight", "down_proj.weight" };
    for (const char * t : targets) {
        if (strstr(name, t) != nullptr) {
            return true;
        }
    }
    return false;
}
// Important tensors for the L preset: v_proj + down_proj + o_proj.
static bool is_important_l(const char * name) {
    return strstr(name, "v_proj.weight")    != nullptr ||
           strstr(name, "down_proj.weight") != nullptr ||
           strstr(name, "o_proj.weight")    != nullptr;
}
// True for the token-embedding weight table.
static bool is_embed(const char * name) {
    const char * hit = strstr(name, "embed_tokens.weight");
    return hit != nullptr;
}
// Should this tensor be quantized at all?
// VAE weights, rank-1 tensors, the text-encoder's embedding table, and a
// few special buffers are kept in their original precision.
static bool should_quantize(const char * name, int n_dims, const char * arch) {
    if (strstr(arch, "vae") != nullptr) {
        return false;
    }
    if (n_dims < 2) {
        return false;
    }
    if (strstr(arch, "text-enc") != nullptr && strstr(name, "embed_tokens") != nullptr) {
        return false;
    }
    // Substrings that mark tensors which must never be quantized.
    static const char * const keep_list[] = {
        "silence_latent", "scale_shift_table", "null_condition_emb",
    };
    for (const char * kw : keep_list) {
        if (strstr(name, kw) != nullptr) {
            return false;
        }
    }
    return true;
}
// Decide the on-disk type for one tensor under the given preset.
// Returns GGML_TYPE_COUNT when the tensor should be left untouched.
static enum ggml_type pick_type(const char * name, int n_dims, const char * arch,
                                const QuantVariant & v, int n_layers) {
    if (!should_quantize(name, n_dims, arch)) {
        return GGML_TYPE_COUNT;
    }
    // LM token embedding gets its own (usually higher-precision) type.
    if (is_embed(name) && strstr(arch, "text-enc") == nullptr) {
        return v.embed == GGML_TYPE_COUNT ? v.base : v.embed;
    }
    // Anything that is not an "important" tensor — or any preset with no
    // bump type configured — stays at the base type.
    const bool important = (v.bump_mode == 3) ? is_important_l(name)
                                              : is_important_sm(name);
    if (!important || v.bump == GGML_TYPE_COUNT) {
        return v.base;
    }
    const int layer = extract_layer(name);
    bool bumped = false;
    if (v.bump_mode == 1) {
        // First N layers only.
        bumped = layer >= 0 && layer < v.bump_n;
    } else if (v.bump_mode == 2) {
        // M variant: first few + last few + every 3rd layer in between.
        const int ql = n_layers;
        bumped = layer >= 0 &&
                 (layer < ql / 9 || layer >= ql - ql / 7 || layer % 3 == 0);
    } else if (v.bump_mode == 3) {
        // L variant: all important tensors (v+down+o_proj).
        bumped = true;
    }
    return bumped ? v.bump : v.base;
}
// Rank-1 tensors (norms/biases) are tiny; promote them to F32 for precision.
static bool should_promote_f32(int n_dims) {
    return n_dims <= 1;
}
// Convert `n` contiguous source values to F32.
// Supported source types: F32 (memcpy), F16 and BF16 (ggml row converters).
// Returns false for any other source type; dst is untouched in that case.
static bool to_f32(const void * src, float * dst, int64_t n, enum ggml_type type) {
    if (type == GGML_TYPE_F32) {
        memcpy(dst, src, (size_t)n * sizeof(float));
        return true;
    }
    if (type == GGML_TYPE_F16) {
        ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, n);
        return true;
    }
    if (type == GGML_TYPE_BF16) {
        ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, n);
        return true;
    }
    return false;
}
// Write `size` bytes to `f`; returns false on a short write so callers can
// detect disk-full / I/O errors instead of silently producing a corrupt GGUF.
static bool write_all(FILE * f, const void * data, size_t size) {
    return fwrite(data, 1, size, f) == size;
}

// Entry point: parse args, mmap the input GGUF, plan per-tensor target types,
// write the output header, then stream tensor payloads one at a time.
int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "Usage: %s <input.gguf> <output.gguf> <type>\n", argv[0]);
        fprintf(stderr, "Types:");
        for (const auto & v : VARIANTS) fprintf(stderr, " %s", v.name);
        fprintf(stderr, "\n");
        return 1;
    }
    const char * inp_path = argv[1];
    const char * out_path = argv[2];
    const QuantVariant * variant = find_variant(argv[3]);
    if (!variant) {
        fprintf(stderr, "[Quantize] Unknown type: %s\n", argv[3]);
        return 1;
    }
    fprintf(stderr, "[Quantize] %s -> %s (%s)\n", inp_path, out_path, variant->name);
    // Mmap input file: tensor payloads are read straight from the mapping so
    // only one tensor's worth of scratch memory is live at a time.
#ifdef _WIN32
    HANDLE fh = CreateFileA(inp_path, GENERIC_READ, FILE_SHARE_READ, NULL,
                            OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (fh == INVALID_HANDLE_VALUE) {
        fprintf(stderr, "[Quantize] Failed to open %s\n", inp_path);
        return 1;
    }
    HANDLE mh = CreateFileMappingA(fh, NULL, PAGE_READONLY, 0, 0, NULL);
    if (!mh) {
        fprintf(stderr, "[Quantize] CreateFileMapping failed %s\n", inp_path);
        CloseHandle(fh);
        return 1;
    }
    void * mapping = MapViewOfFile(mh, FILE_MAP_READ, 0, 0, 0);
    if (!mapping) {
        fprintf(stderr, "[Quantize] MapViewOfFile failed %s\n", inp_path);
        CloseHandle(mh); CloseHandle(fh);
        return 1;
    }
#else
    int fd = open(inp_path, O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }
    struct stat st;
    // BUG FIX: fstat was previously unchecked; a failure would have passed
    // garbage st_size to mmap.
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }
    size_t file_size = (size_t)st.st_size;
    void * mapping = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapping == MAP_FAILED) { perror("mmap"); close(fd); return 1; }
#endif
    // Single cleanup path for the platform-specific mapping resources,
    // so every error exit below releases them (the old code leaked the
    // mapping and gguf contexts on the metadata-write and fopen failures).
    auto unmap_input = [&]() {
#ifdef _WIN32
        UnmapViewOfFile(mapping); CloseHandle(mh); CloseHandle(fh);
#else
        munmap(mapping, file_size); close(fd);
#endif
    };
    // Parse input GGUF (metadata only; no_alloc keeps tensor data unloaded).
    struct ggml_context * meta = nullptr;
    struct gguf_init_params params = { /*no_alloc=*/ true, /*ctx=*/ &meta };
    struct gguf_context * inp = gguf_init_from_file(inp_path, params);
    if (!inp) {
        fprintf(stderr, "[Quantize] Failed to read %s\n", inp_path);
        unmap_input();
        return 1;
    }
    const size_t data_off = gguf_get_data_offset(inp);
    const int n_tensors = (int)gguf_get_n_tensors(inp);
    // Read architecture (drives the quantization policy).
    char arch[64] = "unknown";
    {
        int64_t idx = gguf_find_key(inp, "general.architecture");
        if (idx >= 0) {
            const char * s = gguf_get_val_str(inp, (int)idx);
            snprintf(arch, sizeof(arch), "%s", s);
        }
    }
    // Read block count for the layer-based bump policy (0 if absent).
    int n_layers = 0;
    {
        char key[128];
        snprintf(key, sizeof(key), "%s.block_count", arch);
        int64_t idx = gguf_find_key(inp, key);
        if (idx >= 0) n_layers = (int)gguf_get_val_u32(inp, (int)idx);
    }
    fprintf(stderr, "[Quantize] Arch=%s Layers=%d\n", arch, n_layers);
    // Create output GGUF: copy all KV metadata, then stamp quant info.
    struct gguf_context * out = gguf_init_empty();
    gguf_set_kv(out, inp);
    gguf_set_val_u32(out, "general.quantization_version", 2);
    gguf_set_val_str(out, "general.file_type", variant->name);
    // Plan: decide the target type for each tensor up front so the output
    // header (which records types/offsets) can be written before the data.
    struct TensorPlan { enum ggml_type target; bool quantize; bool promote; };
    std::vector<TensorPlan> plans((size_t)n_tensors);
    for (int i = 0; i < n_tensors; i++) {
        const char * name = gguf_get_tensor_name(inp, i);
        struct ggml_tensor * t = ggml_get_tensor(meta, name);
        const int n_dims = ggml_n_dims(t);
        gguf_add_tensor(out, t);
        plans[(size_t)i] = {GGML_TYPE_COUNT, false, false};
        enum ggml_type target = pick_type(name, n_dims, arch, *variant, n_layers);
        // Promote 1D norms/biases BF16/F16 -> F32.
        if (target == GGML_TYPE_COUNT && should_promote_f32(n_dims) &&
            (t->type == GGML_TYPE_BF16 || t->type == GGML_TYPE_F16)) {
            gguf_set_tensor_type(out, name, GGML_TYPE_F32);
            plans[(size_t)i] = {GGML_TYPE_F32, false, true};
            continue;
        }
        if (target == GGML_TYPE_COUNT) continue;
        // Only quantize convertible float sources whose row length is a
        // multiple of the target block size; everything else passes through.
        bool can_convert = (t->type == GGML_TYPE_BF16 ||
                            t->type == GGML_TYPE_F16 ||
                            t->type == GGML_TYPE_F32);
        bool aligned = (t->ne[0] % ggml_blck_size(target) == 0);
        if (can_convert && aligned) {
            gguf_set_tensor_type(out, name, target);
            plans[(size_t)i] = {target, true, false};
        }
    }
    // Write metadata only (header + tensor info, no data).
    if (!gguf_write_to_file(out, out_path, /*only_meta=*/ true)) {
        fprintf(stderr, "[Quantize] Failed to write metadata %s\n", out_path);
        gguf_free(out); gguf_free(inp); ggml_free(meta);
        unmap_input();
        return 1;
    }
    // Stream tensor data one at a time (low memory), appended after the header.
    FILE * fout = fopen(out_path, "ab");
    if (!fout) {
        fprintf(stderr, "[Quantize] Failed to open %s for append\n", out_path);
        gguf_free(out); gguf_free(inp); ggml_free(meta);
        unmap_input();
        return 1;
    }
    const size_t alignment = gguf_get_alignment(out);
    int n_quantized = 0, n_promoted = 0;
    int64_t bytes_in = 0, bytes_out = 0;
    size_t data_pos = 0;
    bool io_ok = true;  // any failed write aborts the loop and the program
    for (int i = 0; io_ok && i < n_tensors; i++) {
        const char * name = gguf_get_tensor_name(inp, i);
        struct ggml_tensor * t = ggml_get_tensor(meta, name);
        const int64_t nel = ggml_nelements(t);
        const size_t src_size = ggml_nbytes(t);
        const size_t t_off = gguf_get_tensor_offset(inp, i);
        const void * src = (const uint8_t *)mapping + data_off + t_off;
        bytes_in += (int64_t)src_size;
        // Pad to the alignment boundary, in bounded chunks. BUG FIX: the old
        // code did fwrite(zeros, 1, pad, ...) from a fixed 64-byte array —
        // GGUF alignment is metadata-controlled and may exceed 64, which made
        // that an out-of-bounds read.
        size_t pad = (alignment - (data_pos % alignment)) % alignment;
        const uint8_t zeros[64] = {};
        while (io_ok && pad > 0) {
            const size_t chunk = pad < sizeof(zeros) ? pad : sizeof(zeros);
            io_ok = write_all(fout, zeros, chunk);
            data_pos += chunk;
            pad -= chunk;
        }
        if (!io_ok) break;
        const TensorPlan & plan = plans[(size_t)i];
        if (plan.promote) {
            // BF16/F16 -> F32
            std::vector<float> f32((size_t)nel);
            if (!to_f32(src, f32.data(), nel, t->type)) {
                fprintf(stderr, "[Quantize] Cannot convert %s to F32\n", name);
                io_ok = false;
                break;
            }
            const size_t out_size = (size_t)nel * sizeof(float);
            io_ok = write_all(fout, f32.data(), out_size);
            data_pos += out_size;
            bytes_out += (int64_t)out_size;
            n_promoted++;
        } else if (plan.quantize) {
            // Quantize: src -> f32 -> target
            std::vector<float> f32((size_t)nel);
            if (!to_f32(src, f32.data(), nel, t->type)) {
                fprintf(stderr, "[Quantize] Cannot convert %s to F32\n", name);
                io_ok = false;
                break;
            }
            const int64_t n_per_row = t->ne[0];
            const int64_t nrows = nel / n_per_row;
            const size_t qsize = ggml_row_size(plan.target, n_per_row) * (size_t)nrows;
            std::vector<uint8_t> qbuf(qsize);
            ggml_quantize_chunk(plan.target, f32.data(), qbuf.data(),
                                0, nrows, n_per_row, nullptr);
            io_ok = write_all(fout, qbuf.data(), qsize);
            data_pos += qsize;
            bytes_out += (int64_t)qsize;
            n_quantized++;
        } else {
            // Keep as-is (copy the raw payload through unchanged).
            io_ok = write_all(fout, src, src_size);
            data_pos += src_size;
            bytes_out += (int64_t)src_size;
        }
    }
    // fclose flushes buffered data; a failure here also means a bad file.
    if (fclose(fout) != 0) {
        io_ok = false;
    }
    if (!io_ok) {
        fprintf(stderr, "[Quantize] I/O error while writing %s\n", out_path);
        gguf_free(out); gguf_free(inp); ggml_free(meta);
        unmap_input();
        return 1;
    }
    fprintf(stderr, "[Quantize] Quantized %d/%d tensors, promoted %d to F32\n",
            n_quantized, n_tensors, n_promoted);
    fprintf(stderr, "[Quantize] %.1f GB -> %.1f GB (%.1fx)\n",
            (double)bytes_in / 1e9, (double)bytes_out / 1e9,
            bytes_out > 0 ? (double)bytes_in / (double)bytes_out : 0.0);
    fprintf(stderr, "[Quantize] Wrote %s\n", out_path);
    gguf_free(out);
    gguf_free(inp);
    ggml_free(meta);
    unmap_input();
    return 0;
}