Merge commit '3ca19b0e9f' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	common/CMakeLists.txt
#	common/chat-peg-parser.cpp
#	docs/backend/SYCL.md
#	docs/ops.md
#	docs/ops/SYCL.csv
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/convert.hpp
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/norm.cpp
#	ggml/src/ggml-sycl/rope.cpp
#	ggml/src/ggml-sycl/rope.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl
#	scripts/compare-llama-bench.py
#	scripts/sync_vendor.py
#	tests/CMakeLists.txt
#	tools/cli/cli.cpp
This commit is contained in:
Concedo 2026-03-15 11:11:31 +08:00
commit 67c9798d0b
41 changed files with 836 additions and 212 deletions

View file

@ -872,9 +872,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
quantize_state_impl qs(model, params);
// these need to be set to n_layer by default
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
if (params->only_copy) {
ftype = ml.ftype;
}
@ -981,6 +978,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// compute tensor metadata once and cache it
std::vector<tensor_metadata> metadata(tensors.size());
// initialize quantization state before preliminary loop (counters for use_more_bits)
{
for (size_t i = 0; i < tensors.size(); ++i) {
const auto cat = tensor_get_category(tensors[i]->tensor->name);
if (category_is_attn_v(cat)) {
++qs.n_attention_wv;
}
if (cat == tensor_category::OUTPUT) {
qs.has_tied_embeddings = false;
}
metadata[i].category = cat; // save and re-use the category while we're at it
}
// these also need to be set to n_layer by default
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
}
// flag for --dry-run
bool will_require_imatrix = false;
@ -993,16 +1006,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const struct ggml_tensor * tensor = it->tensor;
const std::string name = ggml_get_name(tensor);
metadata[i].category = tensor_get_category(name);
if (category_is_attn_v(metadata[i].category)) {
++qs.n_attention_wv;
}
if (tensor_name_match_output_weight(name.c_str())) {
qs.has_tied_embeddings = false;
}
uint16_t i_split = params->keep_split ? it->idx : 0;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());