diff --git a/README.md b/README.md
index f3c6d5f5c..4473949c0 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ KoboldCpp can now also be run on Novita AI, a newer alternative GPU cloud provid
## Obtaining a GGUF model
- KoboldCpp uses GGUF models. They are not included with KoboldCpp, but you can download GGUF files from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke). Search for "GGUF" on huggingface.co for plenty of compatible models in the `.gguf` format.
-- For beginners, we recommend the models [Airoboros Mistral](https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model).
+- For beginners, we recommend the models [BookAdventures 8B](https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model).
- [Alternatively, you can download the tools to convert models to the GGUF format yourself here](https://kcpptools.concedo.workers.dev). Run `convert-hf-to-gguf.py` to convert them, then `quantize_gguf.exe` to quantize the result.
- Other models for Whisper (speech recognition), Image Generation or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
diff --git a/colab.ipynb b/colab.ipynb
index 09eac333b..6584d0bf7 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,7 +48,7 @@
"source": [
"#@title v-- Enter your model below and then click this to start Koboldcpp\n",
"\n",
- "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/mradermacher/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/mini-magnum-12b-v1.1-GGUF/resolve/main/mini-magnum-12b-v1.1.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/Rocinante-12B-v1.1-GGUF/resolve/main/Rocinante-12B-v1.1-Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\"]{allow-input: true}\n",
+ "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/mradermacher/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/mini-magnum-12b-v1.1-GGUF/resolve/main/mini-magnum-12b-v1.1.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/Rocinante-12B-v1.1-GGUF/resolve/main/Rocinante-12B-v1.1-Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\"]{allow-input: true}\n",
"Layers = 99 #@param [99]{allow-input: true}\n",
"ContextSize = 4096 #@param [4096,8192] {allow-input: true}\n",
"FlashAttention = True #@param {type:\"boolean\"}\n",
diff --git a/common/common.cpp b/common/common.cpp
index 76e84709e..0c1963c07 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -848,7 +848,7 @@ struct common_init_result common_init_from_params(common_params & params) {
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
- model = llama_load_model_from_file(params.model.c_str(), mparams);
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
@@ -875,7 +875,7 @@ struct common_init_result common_init_from_params(common_params & params) {
}
if (!ok) {
- llama_free_model(model);
+ llama_model_free(model);
return iparams;
}
@@ -886,7 +886,7 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
+ llama_model_free(model);
return iparams;
}
@@ -902,7 +902,7 @@ struct common_init_result common_init_from_params(common_params & params) {
const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
return iparams;
}
@@ -915,7 +915,7 @@ struct common_init_result common_init_from_params(common_params & params) {
params.control_vector_layer_end);
if (err) {
llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
return iparams;
}
@@ -928,7 +928,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
return iparams;
}
@@ -984,7 +984,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = bos;
}
tmp.clear();
@@ -1413,7 +1413,7 @@ struct llama_model * common_load_model_from_url(
}
}
- return llama_load_model_from_file(local_path.c_str(), params);
+ return llama_model_load_from_file(local_path.c_str(), params);
}
struct llama_model * common_load_model_from_hf(
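
The renames in the hunks above (together with the `include/llama.h` hunk later in this patch) move callers from `llama_load_model_from_file`/`llama_free_model` to `llama_model_load_from_file`/`llama_model_free`, keeping the old names only as deprecated wrappers. A minimal sketch of the new call pattern, assuming the usual `llama.h` entry points (`llama_backend_init`, `llama_model_default_params`) and a placeholder model path:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // new name; the deprecated alias llama_load_model_from_file still compiles but warns
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model != NULL) {
        // ... create a context with llama_new_context_with_model(), run inference ...
        llama_model_free(model); // previously llama_free_model
    }

    llama_backend_free();
    return 0;
}
```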
diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp
index a9dfb6714..a057ae45f 100644
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
- return -1;
+ return LLAMA_TOKEN_NULL;
}
const common_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0;
int sum_count_static = 0;
- llama_token max_token = -1;
+ llama_token max_token = LLAMA_TOKEN_NULL;
for (std::pair<llama_token, int32_t> token_count_static : part_static) {
const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
}
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
- return -1;
+ return LLAMA_TOKEN_NULL;
}
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
- return -1;
+ return LLAMA_TOKEN_NULL;
}
return max_token;
}
@@ -98,9 +98,9 @@ static llama_token try_draft(
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {
- llama_token drafted_token = -1;
+ llama_token drafted_token = LLAMA_TOKEN_NULL;
- for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+ for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
const common_ngram ngram_primary = ngrams_primary[i];
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
int max_count_primary = 0;
int max_count_static = 0;
int sum_count_primary = 0;
- llama_token max_token = -1;
+ llama_token max_token = LLAMA_TOKEN_NULL;
for (std::pair<llama_token, int32_t> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
}
while ((int) draft.size()-1 < n_draft) {
- llama_token drafted_token = -1;
+ llama_token drafted_token = LLAMA_TOKEN_NULL;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
common_ngram ngram_static;
@@ -177,17 +177,17 @@ void common_ngram_cache_draft(
}
ngrams_cd.push_back(ngram_cd);
}
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
}
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
}
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
drafted_token = try_draft(nc_static, ngram_static);
}
- if (drafted_token == -1) {
+ if (drafted_token == LLAMA_TOKEN_NULL) {
break;
}
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
index 09c2b0319..dfe012abe 100644
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -17,13 +17,13 @@ struct common_ngram {
common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
- tokens[i] = -1;
+ tokens[i] = LLAMA_TOKEN_NULL;
}
}
common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
- tokens[i] = i < ngram_size ? input[i] : -1;
+ tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
}
}
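
The n-gram cache hunks above replace the bare `-1` sentinel for "no token" with the named constant `LLAMA_TOKEN_NULL`, which the `include/llama.h` hunk further down still defines as `-1`. A small illustrative sketch (not taken from the patch; `first_non_null` is a hypothetical helper) of the style this standardizes on:

```cpp
#include "llama.h"

// Return the first real token in a fixed-size ngram slot, or LLAMA_TOKEN_NULL if none is set.
static llama_token first_non_null(const llama_token * tokens, int n) {
    for (int i = 0; i < n; ++i) {
        if (tokens[i] != LLAMA_TOKEN_NULL) { // instead of: tokens[i] != -1
            return tokens[i];
        }
    }
    return LLAMA_TOKEN_NULL;
}
```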
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4e6c0f60c..01b58f976 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -687,6 +687,9 @@ class Model:
if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
# ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
res = "megrez"
+ if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
+ # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
+ res = "deepseek-v3"
if res is None:
logger.warning("\n")
@@ -3373,6 +3376,24 @@ class CommandR2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+@Model.register("Cohere2ForCausalLM")
+class Cohere2Model(Model):
+ model_arch = gguf.MODEL_ARCH.COHERE2
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+ rotary_pct = self.hparams["rotary_pct"]
+ hidden_size = self.hparams["hidden_size"]
+ num_attention_heads = self.hparams["num_attention_heads"]
+ self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
@Model.register("OlmoForCausalLM")
@Model.register("OLMoForCausalLM")
class OlmoModel(Model):
@@ -3831,6 +3852,7 @@ class DeepseekModel(Model):
@Model.register("DeepseekV2ForCausalLM")
+@Model.register("DeepseekV3ForCausalLM")
class DeepseekV2Model(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -3852,6 +3874,15 @@ class DeepseekV2Model(Model):
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+ if hparams["scoring_func"] == "sigmoid":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+ elif hparams["scoring_func"] == "softmax":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+ else:
+ raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -3864,6 +3895,16 @@ class DeepseekV2Model(Model):
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # rename e_score_correction_bias tensors
+ if name.endswith("e_score_correction_bias"):
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+ # skip Multi-Token Prediction (MTP) layers
+ block_count = self.hparams["num_hidden_layers"]
+ match = re.match(r"model.layers.(\d+)", name)
+ if match and int(match.group(1)) >= block_count:
+ return []
+
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index fea23ddb4..56edc64a7 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -107,6 +107,7 @@ models = [
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
+ {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
]
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 2691c6e6b..27215a42e 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -221,7 +221,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
- llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -265,7 +265,7 @@ static void llava_free(struct llava_context * ctx_llava) {
}
llama_free(ctx_llava->ctx_llama);
- llama_free_model(ctx_llava->model);
+ llama_model_free(ctx_llava->model);
llama_backend_free();
}
@@ -323,7 +323,7 @@ int main(int argc, char ** argv) {
}
}
- llama_free_model(model);
+ llama_model_free(model);
return 0;
}
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index e9cbb51ed..2342bdd09 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
- llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) {
}
llama_free(ctx_llava->ctx_llama);
- llama_free_model(ctx_llava->model);
+ llama_model_free(ctx_llava->model);
llama_backend_free();
}
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index e86a60280..f3e5d66e2 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -310,7 +310,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
- llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+ llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -354,7 +354,7 @@ static void llava_free(struct llava_context * ctx_llava) {
}
llama_free(ctx_llava->ctx_llama);
- llama_free_model(ctx_llava->model);
+ llama_model_free(ctx_llava->model);
llama_backend_free();
}
@@ -575,7 +575,7 @@ int main(int argc, char ** argv) {
}
}
- llama_free_model(model);
+ llama_model_free(model);
return 0;
}
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c79db25dc..c20dec7e2 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -495,7 +495,7 @@ int main(int argc, char ** argv) {
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_token_bos(model);
}
@@ -832,7 +832,7 @@ int main(int argc, char ** argv) {
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
llama_token eot = llama_token_eot(model);
- embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+ embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
need_insert_eot = false;
}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c2e62ba69..127323e77 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3797,7 +3797,7 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist
std::string prompt = json_value(data, "prompt", std::string());
- std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+ std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill(
ctx_server.ctx,
diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py
index ad4b8192a..10554db0f 100644
--- a/examples/server/tests/unit/test_infill.py
+++ b/examples/server/tests/unit/test_infill.py
@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
"input_suffix": "}\n",
})
assert res.status_code == 200
- assert match_regex("(Ann|small|shiny)+", res.body["content"])
+ assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
def test_infill_with_input_extra():
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index dc6e6e67e..ad130d490 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -507,7 +507,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
- std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+ std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 8f3d222cd..1030ebb7f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -770,7 +770,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
+ if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
@@ -801,9 +801,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
for (int i = 0; i < graph->n_nodes; i++) {
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
sched->splits[cur_split].n_inputs);
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+ if (j == 0) {
+ GGML_LOG_DEBUG(": ");
+ }
GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index ce30e299b..a37d983b4 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -4231,6 +4231,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g
buffer->buft = buft;
buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
+ buffer->iface.get_tensor = nullptr;
+ buffer->iface.cpy_tensor = nullptr;
return buffer;
}
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 3896f956d..5b0dfacef 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -680,6 +680,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_F16:
return convert_unary_cuda<half>;
+ case GGML_TYPE_BF16:
+ return convert_unary_cuda<nv_bfloat16>;
default:
return nullptr;
}
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d1926961a..e2a110582 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1729,7 +1729,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
- bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
+ bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -2874,6 +2874,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
+ case GGML_TYPE_BF16:
#ifdef GGML_USE_MUSA
if (a->type == GGML_TYPE_Q3_K) {
return false;
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu
index a4b4f6bc1..ac45f2d17 100644
--- a/ggml/src/ggml-cuda/mmv.cu
+++ b/ggml/src/ggml-cuda/mmv.cu
@@ -1,9 +1,9 @@
#include "common.cuh"
#include "mmv.cuh"
-template <typename type_acc, int block_size>
+template <typename T, typename type_acc, int block_size>
static __global__ void mul_mat_vec(
- const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+ const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
const int64_t row = blockIdx.x;
const int64_t channel = blockIdx.z;
@@ -13,7 +13,6 @@ static __global__ void mul_mat_vec(
y += channel *stride_channel_y;
dst += channel *stride_channel_dst;
- const half2 * x2 = (const half2 *) x;
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
@@ -28,28 +27,44 @@ static __global__ void mul_mat_vec(
float sumf;
- if (std::is_same<type_acc, float>::value) {
+ if constexpr (std::is_same<T, half>::value) {
+ const half2 * x2 = (const half2 *) x;
+
+ if (std::is_same<type_acc, float>::value) {
+ sumf = 0.0f;
+
+ for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+ const float2 tmpx = __half22float2(x2[col2]);
+ const float2 tmpy = y2[col2];
+ sumf += tmpx.x * tmpy.x;
+ sumf += tmpx.y * tmpy.y;
+ }
+ } else {
+#ifdef FP16_AVAILABLE
+ half2 sumh2 = make_half2(0.0f, 0.0f);
+
+ for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+ const float2 tmp = y2[col2];
+ sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
+ }
+
+ sumf = __low2float(sumh2) + __high2float(sumh2);
+#else
+ NO_DEVICE_CODE;
+#endif // FP16_AVAILABLE
+ }
+ } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+ const int * x2 = (const int *) x;
sumf = 0.0f;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
- const float2 tmpx = __half22float2(x2[col2]);
+ const int tmpx = x2[col2];
const float2 tmpy = y2[col2];
- sumf += tmpx.x * tmpy.x;
- sumf += tmpx.y * tmpy.y;
+ sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+ sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
}
} else {
-#ifdef FP16_AVAILABLE
- half2 sumh2 = make_half2(0.0f, 0.0f);
-
- for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
- const float2 tmp = y2[col2];
- sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
- }
-
- sumf = __low2float(sumh2) + __high2float(sumh2);
-#else
- NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
+ static_assert(std::is_same<T, void>::value, "unsupported type");
}
sumf = warp_reduce_sum(sumf);
@@ -71,9 +86,9 @@ static __global__ void mul_mat_vec(
dst[row] = sumf;
}
-template <typename type_acc>
+template <typename T, typename type_acc>
static void launch_mul_mat_vec_cuda(
- const half * x, const float * y, float * dst,
+ const T * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
cudaStream_t stream) {
@@ -97,35 +112,35 @@ static void launch_mul_mat_vec_cuda(
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 64: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 96: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 128: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 160: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 192: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 224: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
case 256: {
- mul_mat_vec<<>>
+ mul_mat_vec<<>>
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
} break;
default: {
@@ -134,25 +149,25 @@ static void launch_mul_mat_vec_cuda(
}
}
+template <typename T>
static void mul_mat_vec_cuda(
- const half * x, const float * y, float * dst,
+ const T * x, const float * y, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
enum ggml_prec prec, cudaStream_t stream) {
switch (prec) {
case GGML_PREC_DEFAULT: {
- launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+ launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
} break;
case GGML_PREC_F32: {
- launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+ launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
} break;
}
}
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -164,7 +179,6 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
- const half * src0_d = (const half *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
@@ -181,7 +195,20 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
const int64_t channel_stride_y = src1->nb[2] / ggml_type_size(src1->type);
const int64_t channel_stride_dst = dst->nb[2] / ggml_type_size( dst->type);
- mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+ switch (src0->type) {
+ case GGML_TYPE_F16: {
+ const half * src0_d = (const half *) src0->data;
+ mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+ channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+ } break;
+ case GGML_TYPE_BF16: {
+ const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
+ mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+ channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+ } break;
+ default:
+ GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+ }
}
void ggml_cuda_op_mul_mat_vec(
@@ -190,7 +217,6 @@ void ggml_cuda_op_mul_mat_vec(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream) {
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -211,8 +237,20 @@ void ggml_cuda_op_mul_mat_vec(
const int64_t channel_stride_y = 0;
const int64_t channel_stride_dst = 0;
- mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
- nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+ switch (src0->type) {
+ case GGML_TYPE_F16: {
+ const half * src0_d = (const half *) src0_dd_i;
+ mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+ nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+ } break;
+ case GGML_TYPE_BF16: {
+ const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
+ mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+ nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+ } break;
+ default:
+ GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+ }
GGML_UNUSED(ctx);
GGML_UNUSED(src1);
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
index db9f6a165..1746b0732 100644
--- a/ggml/src/ggml-cuda/vendors/cuda.h
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -3,6 +3,7 @@
#include
#include
#include
+#include <cuda_bf16.h>
#include
#if CUDART_VERSION < 11020
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 3205534d6..c905b15d7 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -3,6 +3,7 @@
#include
#include
#include
+#include <hip/hip_bfloat16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
@@ -121,6 +122,8 @@
#define __has_builtin(x) 0
#endif
+typedef hip_bfloat16 nv_bfloat16;
+
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
index 1604b8229..6cc1b69ee 100644
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -3,6 +3,7 @@
#include
#include
#include
+#include <musa_bf16.h>
#include
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
@@ -132,3 +133,5 @@
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
+
+typedef mt_bfloat16 nv_bfloat16;
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 431082426..63da2b86b 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -27,15 +27,6 @@
#endif
#include
-#define UNUSED GGML_UNUSED
-
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
#ifdef _WIN32
typedef SOCKET sockfd_t;
using ssize_t = __int64;
@@ -93,9 +84,23 @@ enum rpc_cmd {
RPC_CMD_COPY_TENSOR,
RPC_CMD_GRAPH_COMPUTE,
RPC_CMD_GET_DEVICE_MEMORY,
+ RPC_CMD_INIT_TENSOR,
+ RPC_CMD_GET_ALLOC_SIZE,
RPC_CMD_COUNT,
};
+struct rpc_msg_get_alloc_size_req {
+ rpc_tensor tensor;
+};
+
+struct rpc_msg_get_alloc_size_rsp {
+ uint64_t alloc_size;
+};
+
+struct rpc_msg_init_tensor_req {
+ rpc_tensor tensor;
+};
+
struct rpc_msg_alloc_buffer_req {
uint64_t size;
};
@@ -397,7 +402,7 @@ static std::shared_ptr get_socket(const std::string & endpoint) {
initialized = true;
}
#else
- UNUSED(initialized);
+ GGML_UNUSED(initialized);
#endif
auto sock = socket_connect(host.c_str(), port);
if (sock == nullptr) {
@@ -461,10 +466,18 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
}
static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
- UNUSED(buffer);
- if (ggml_is_quantized(tensor->type)) {
- // TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
- GGML_ASSERT(tensor->ne[0] % 512 == 0 && "unsupported quantized tensor");
+ ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+
+ // CUDA backend on the server pads everything to 512 due to CUDA limitations.
+ // Due to bandwidth constraints, we only call the server init tensor functions if necessary.
+ // In particular, only quantized tensors need padding
+ if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+ rpc_msg_init_tensor_req request;
+
+ request.tensor = serialize_tensor(tensor);
+
+ bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
+ GGML_ASSERT(status);
}
}
@@ -577,8 +590,23 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
}
static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
- UNUSED(buft);
- return ggml_nbytes(tensor);
+ // See comments in init_tensor.
+ if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+ ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+ auto sock = get_socket(buft_ctx->endpoint);
+
+ rpc_msg_get_alloc_size_req request;
+
+ request.tensor = serialize_tensor(tensor);
+
+ rpc_msg_get_alloc_size_rsp response;
+ bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
+ GGML_ASSERT(status);
+
+ return response.alloc_size;
+ } else {
+ return ggml_nbytes(tensor);
+ }
}
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
@@ -603,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
}
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
- UNUSED(backend);
+ GGML_UNUSED(backend);
// this is no-op because we don't have any async operations
}
@@ -757,6 +785,8 @@ public:
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
+ bool init_tensor(const rpc_msg_init_tensor_req & request);
+ bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
private:
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@@ -770,6 +800,36 @@ private:
std::unordered_set<ggml_backend_buffer_t> buffers;
};
+bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
+ ggml_backend_buffer_type_t buft;
+ struct ggml_init_params params {
+ /*.mem_size =*/ ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+
+ if (tensor == nullptr) {
+ GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
+ ggml_free(ctx);
+ return false;
+ }
+
+ if (tensor->buffer == nullptr) {
+ //No buffer allocated.
+ buft = ggml_backend_get_default_buffer_type(backend);
+ } else {
+ buft = tensor->buffer->buft;
+ }
+
+ response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
+
+ ggml_free(ctx);
+ return true;
+}
+
void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
@@ -781,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
buffers.insert(buffer);
} else {
- GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+ GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
}
}
@@ -803,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
if (buffers.find(buffer) == buffers.end()) {
- GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+ GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
return false;
}
void * base = ggml_backend_buffer_get_base(buffer);
@@ -815,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
if (buffers.find(buffer) == buffers.end()) {
- GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+ GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
return false;
}
ggml_backend_buffer_free(buffer);
@@ -827,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
if (buffers.find(buffer) == buffers.end()) {
- GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+ GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
return false;
}
ggml_backend_buffer_clear(buffer, request.value);
@@ -883,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector & input) {
struct ggml_context * ctx = ggml_init(params);
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
if (tensor == nullptr) {
- GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+ GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
ggml_free(ctx);
return false;
}
@@ -905,6 +965,40 @@ bool rpc_server::set_tensor(const std::vector & input) {
return true;
}
+bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
+ struct ggml_init_params params {
+ /*.mem_size =*/ ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ struct ggml_context * ctx = ggml_init(params);
+ ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
+ if (tensor == nullptr) {
+ GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
+ ggml_free(ctx);
+ return false;
+ }
+
+ // Call the backend's buffer_init_tensor function
+ ggml_backend_buffer_t buffer = tensor->buffer;
+ if (buffer && buffer->iface.init_tensor) {
+ buffer->iface.init_tensor(buffer, tensor);
+ } else {
+ GGML_LOG_ERROR("Null buffer for tensor passed to init_tensor function\n");
+ }
+
+ if (tensor->extra != nullptr) {
+ // This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
+ // Currently unimplemented.
+ GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
+ ggml_free(ctx);
+ return false;
+ }
+
+ ggml_free(ctx);
+ return true;
+}
+
bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
@@ -914,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
struct ggml_context * ctx = ggml_init(params);
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) {
- GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+ GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
ggml_free(ctx);
return false;
}
@@ -948,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
if (src == nullptr || dst == nullptr) {
- GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
+ GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
ggml_free(ctx);
return false;
}
@@ -1058,6 +1152,18 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
}
break;
}
+ case RPC_CMD_GET_ALLOC_SIZE: {
+ rpc_msg_get_alloc_size_req request;
+ if (!recv_msg(sockfd, &request, sizeof(request))) {
+ return;
+ }
+ rpc_msg_get_alloc_size_rsp response;
+ server.get_alloc_size(request, response);
+ if (!send_msg(sockfd, &response, sizeof(response))) {
+ return;
+ }
+ break;
+ }
case RPC_CMD_GET_ALIGNMENT: {
if (!recv_msg(sockfd, nullptr, 0)) {
return;
@@ -1133,6 +1239,19 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
}
break;
}
+ case RPC_CMD_INIT_TENSOR: {
+ rpc_msg_init_tensor_req request;
+ if (!recv_msg(sockfd, &request,sizeof(request))) {
+ return;
+ }
+ if (!server.init_tensor(request)) {
+ return;
+ }
+ if (!send_msg(sockfd, nullptr, 0)) {
+ return;
+ }
+ break;
+ }
case RPC_CMD_GET_TENSOR: {
rpc_msg_get_tensor_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
@@ -1257,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
- UNUSED(dev);
+ GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
// TODO: obtain value from the server
return GGML_BACKEND_DEVICE_TYPE_GPU;
- UNUSED(dev);
+ GGML_UNUSED(dev);
}
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -1285,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const
return ggml_backend_rpc_init(ctx->endpoint.c_str());
- UNUSED(params);
+ GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -1293,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
- UNUSED(dev);
+ GGML_UNUSED(dev);
}
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
- UNUSED(dev);
- UNUSED(op);
+ GGML_UNUSED(dev);
+ GGML_UNUSED(op);
//TODO: call the remote backend and cache the results
return true;
}
@@ -1335,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
return "RPC";
- UNUSED(reg);
+ GGML_UNUSED(reg);
}
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
return 0;
- UNUSED(reg);
+ GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
- UNUSED(reg);
- UNUSED(index);
+ GGML_UNUSED(reg);
+ GGML_UNUSED(index);
}
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
@@ -1357,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
}
return NULL;
- UNUSED(reg);
+ GGML_UNUSED(reg);
}
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
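
The RPC hunks above add two commands, `RPC_CMD_GET_ALLOC_SIZE` and `RPC_CMD_INIT_TENSOR`, each carrying a fixed-size request struct; `GET_ALLOC_SIZE` also returns a fixed-size response so the client can learn the padded allocation size the server needs for quantized tensors. A condensed, hedged sketch of the client-side round trip, using only names that appear in the hunks above (the wrapper `query_alloc_size` is hypothetical, and `socket_t` is assumed to be the socket wrapper type returned by `get_socket`):

```cpp
// Sketch only: mirrors ggml_backend_rpc_buffer_type_get_alloc_size from the patch,
// with buffer-type plumbing and error handling stripped out.
static uint64_t query_alloc_size(std::shared_ptr<socket_t> sock, const ggml_tensor * tensor) {
    rpc_msg_get_alloc_size_req request;
    request.tensor = serialize_tensor(tensor);   // same rpc_tensor serialization used by the other commands

    rpc_msg_get_alloc_size_rsp response;
    const bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE,
                                     &request, sizeof(request), &response, sizeof(response));
    GGML_ASSERT(status);

    // the server answers with ggml_backend_buft_get_alloc_size() for the deserialized tensor
    return response.alloc_size;
}
```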
diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp
index 75ddfb86a..105db6f03 100644
--- a/ggml/src/ggml-sycl/wkv6.cpp
+++ b/ggml/src/ggml-sycl/wkv6.cpp
@@ -131,7 +131,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
[=](sycl::nd_item<3> item_ct1) {
rwkv_wkv_f32_kernel(
B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
- item_ct1, shared_mem_acc.get_pointer()
+ item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
);
});
});
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 749b9d7d9..c02f0a3e8 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2040,6 +2040,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
std::cerr << "Done!" << std::endl;
}
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
+
static vk_device ggml_vk_get_device(size_t idx) {
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
@@ -2175,9 +2177,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
- if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
- // Intel drivers don't support coopmat properly yet
- // Only RADV supports coopmat properly on AMD
+ if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
device->coopmat_support = false;
}
@@ -2515,7 +2515,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
return vk_instance.devices[idx];
}
-
static void ggml_vk_print_gpu_info(size_t idx) {
GGML_ASSERT(idx < vk_instance.device_indices.size());
size_t dev_num = vk_instance.device_indices[idx];
@@ -2565,9 +2564,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
}
}
- if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
- // Intel drivers don't support coopmat properly yet
- // Only RADV supports coopmat properly on AMD
+ if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
coopmat_support = false;
}
@@ -8088,6 +8085,25 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
UNUSED(instance_extensions);
}
+static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
+ switch (props.vendorID) {
+ case VK_VENDOR_ID_INTEL:
+ // Intel drivers don't support coopmat properly yet
+ return false;
+ case VK_VENDOR_ID_AMD:
+ if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
+ // Workaround for AMD proprietary driver reporting support on all GPUs
+ const std::string name = props.deviceName;
+ return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs
+ name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
+ name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs
+ }
+ return true;
+ default:
+ return true;
+ }
+}
+
// checks
#ifdef GGML_VULKAN_CHECK_RESULTS
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 273370370..9d0e7489f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -102,6 +102,8 @@ class Keys:
EXPERT_USED_COUNT = "{arch}.expert_used_count"
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
+ EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
+ EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
@@ -255,6 +257,7 @@ class MODEL_ARCH(IntEnum):
MAMBA = auto()
XVERSE = auto()
COMMAND_R = auto()
+ COHERE2 = auto()
DBRX = auto()
OLMO = auto()
OLMO2 = auto()
@@ -312,6 +315,7 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto()
+ FFN_EXP_PROBS_B = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
@@ -437,6 +441,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
+ MODEL_ARCH.COHERE2: "cohere2",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2",
@@ -496,6 +501,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
+ MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
@@ -1136,6 +1142,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_Q_NORM,
],
+ MODEL_ARCH.COHERE2: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
MODEL_ARCH.DBRX: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -1276,6 +1294,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
],
MODEL_ARCH.CHATGLM : [
MODEL_TENSOR.TOKEN_EMBD,
@@ -1576,6 +1595,11 @@ class GGMLQuantizationType(IntEnum):
TQ2_0 = 35
+class ExpertGatingFuncType(IntEnum):
+ SOFTMAX = 1
+ SIGMOID = 2
+
+
# TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 3023b539a..4a0a65e3c 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -26,6 +26,7 @@ from .constants import (
RopeScalingType,
PoolingType,
TokenType,
+ ExpertGatingFuncType,
)
from .quants import quant_shape_from_byte_shape
@@ -715,6 +716,12 @@ class GGUFWriter:
def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
+ def add_expert_weights_norm(self, value: bool) -> None:
+ self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
+
+ def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
+ self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
+
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7009a11d4..efe2a4aa4 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -276,6 +276,10 @@ class TensorNameMap:
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
),
+ MODEL_TENSOR.FFN_EXP_PROBS_B: (
+ "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
+ ),
+
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
diff --git a/include/llama-cpp.h b/include/llama-cpp.h
index 1500cb2fc..11306b17f 100644
--- a/include/llama-cpp.h
+++ b/include/llama-cpp.h
@@ -9,7 +9,7 @@
#include "llama.h"
struct llama_model_deleter {
- void operator()(llama_model * model) { llama_free_model(model); }
+ void operator()(llama_model * model) { llama_model_free(model); }
};
struct llama_context_deleter {
diff --git a/include/llama.h b/include/llama.h
index 7b305b299..0295a51fb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -34,7 +34,6 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-// TODO: use everywhere in the implementation
#define LLAMA_TOKEN_NULL -1
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
@@ -105,6 +104,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
};
enum llama_rope_type {
@@ -413,12 +413,19 @@ extern "C" {
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
- LLAMA_API struct llama_model * llama_load_model_from_file(
+ DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+ const char * path_model,
+ struct llama_model_params params),
+ "use llama_model_load_from_file instead");
+
+ LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);
- // TODO: rename to llama_model_free
- LLAMA_API void llama_free_model(struct llama_model * model);
+ DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+ "use llama_model_free instead");
+
+ LLAMA_API void llama_model_free(struct llama_model * model);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
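
> The old entry points keep working but now emit deprecation warnings at compile time; migration is a pure rename. A before/after sketch:

```cpp
#include "llama.h"

static llama_model * load_model(const char * path) {
    llama_model_params params = llama_model_default_params();
    // deprecated spelling, still compiles with a warning:
    //   return llama_load_model_from_file(path, params);
    return llama_model_load_from_file(path, params);
}

static void unload_model(llama_model * model) {
    // deprecated spelling: llama_free_model(model);
    llama_model_free(model);
}
```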
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a60038385..007d79f82 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -39,6 +39,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_COHERE2, "cohere2" },
{ LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMO2, "olmo2" },
@@ -91,6 +92,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
+ { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -807,6 +810,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
+ {
+ LLM_ARCH_COHERE2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_DBRX,
{
@@ -968,6 +986,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{
@@ -1350,6 +1369,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+ {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
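
> Registering `LLM_TENSOR_FFN_EXP_PROBS_B` as a repeating per-layer tensor whose canonical op is `GGML_OP_ADD` reflects how it is consumed: the bias is broadcast-added to the router's expert scores, which also lets the backend scheduler place the tensor appropriately. A sketch of the intended graph fragment (shapes are assumptions):

```cpp
#include "ggml.h"

// probs:       [n_expert, n_tokens] router scores after the gating function
// exp_probs_b: [n_expert]           learned per-expert selection bias
// ggml_add broadcasts the bias across the token dimension.
static struct ggml_tensor * selection_scores(struct ggml_context * ctx,
                                             struct ggml_tensor * probs,
                                             struct ggml_tensor * exp_probs_b) {
    return ggml_add(ctx, probs, exp_probs_b);
}
```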
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 446e72eeb..45e458bb9 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -43,6 +43,7 @@ enum llm_arch {
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,
LLM_ARCH_COMMAND_R,
+ LLM_ARCH_COHERE2,
LLM_ARCH_DBRX,
LLM_ARCH_OLMO,
LLM_ARCH_OLMO2,
@@ -95,6 +96,8 @@ enum llm_kv {
LLM_KV_EXPERT_USED_COUNT,
LLM_KV_EXPERT_SHARED_COUNT,
LLM_KV_EXPERT_WEIGHTS_SCALE,
+ LLM_KV_EXPERT_WEIGHTS_NORM,
+ LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
@@ -230,6 +233,7 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index a07e9cf00..44670d3d8 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -45,6 +45,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+ { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
@@ -148,6 +149,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_MINICPM;
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
+ } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct
@@ -453,6 +456,21 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "Assistant:";
}
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
+ // DeepSeek-V3
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "system") {
+ ss << message->content << "\n\n";
+ } else if (role == "user") {
+ ss << LU8("<|User|>") << message->content;
+ } else if (role == "assistant") {
+ ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
+ }
+ }
+ if (add_ass) {
+ ss << LU8("<|Assistant|>");
+ }
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct
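
> Once registered, the template can be exercised without loading a model by passing its name directly; a sketch assuming the current `llama_chat_apply_template` signature, which still accepts an optional model pointer:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    const llama_chat_message chat[] = {
        { "system",    "You are a helpful assistant." },
        { "user",      "Hello" },
        { "assistant", "Hi there." },
        { "user",      "Who are you?" },
    };

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(nullptr, "deepseek3", chat, 4,
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > 0) {
        // Expected shape, per the branch above:
        // You are a helpful assistant.\n\n<|User|>Hello<|Assistant|>Hi there.<|end▁of▁sentence|><|User|>Who are you?<|Assistant|>
        printf("%.*s\n", n, buf.data());
    }
}
```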
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 364318c27..b8e94d9ef 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -25,6 +25,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
LLM_CHAT_TEMPLATE_DEEPSEEK,
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+ LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3,
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 3a76b71a4..a29f20ec4 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -6,7 +6,13 @@
// bump if necessary
#define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
+#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+
+enum llama_expert_gating_func_type {
+ LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+};
struct llama_hparams_posnet {
uint32_t n_embd;
@@ -54,7 +60,9 @@ struct llama_hparams {
uint32_t n_expert_shared = 0;
uint32_t n_norm_groups = 0;
- float expert_weights_scale = 0.0;
+ float expert_weights_scale = 0.0;
+ bool expert_weights_norm = false;
+ uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
float f_norm_eps;
float f_norm_rms_eps;
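
> Taken together, these hyperparameters describe the DeepSeek-V3 router: sigmoid gating, a per-expert selection bias (the new `exp_probs_b` tensor), optional renormalization of the selected experts' weights, and the existing scale factor. The real implementation lives in the ggml graph builder; the following standalone sketch only illustrates the assumed arithmetic:

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

// Sketch: select top-k experts by biased score, weight by the unbiased score.
static std::vector<std::pair<int, float>> route(const std::vector<float> & logits,
                                                const std::vector<float> & bias, // exp_probs_b
                                                int k, bool norm, float scale) {
    const int n = (int) logits.size();
    std::vector<float> probs(n), biased(n);
    for (int i = 0; i < n; ++i) {
        probs[i]  = 1.0f / (1.0f + std::exp(-logits[i])); // sigmoid gating
        biased[i] = probs[i] + bias[i];                   // bias used for selection only
    }

    std::vector<int> idx(n);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return biased[a] > biased[b]; });

    float sum = 0.0f;
    for (int i = 0; i < k; ++i) sum += probs[idx[i]];

    std::vector<std::pair<int, float>> out;
    for (int i = 0; i < k; ++i) {
        float w = probs[idx[i]];
        if (norm) w /= sum;                   // expert_weights_norm
        out.emplace_back(idx[i], w * scale);  // expert_weights_scale
    }
    return out;
}
```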
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 53379253a..90b6c56ed 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -119,10 +119,10 @@ bool llama_kv_cache_init(
struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
struct llama_kv_cache & cache,
- const struct llama_ubatch & batch) {
- const uint32_t n_tokens = batch.n_tokens;
- const uint32_t n_seqs = batch.n_seqs;
- const uint32_t n_seq_tokens = batch.n_seq_tokens;
+ const struct llama_ubatch & ubatch) {
+ const uint32_t n_tokens = ubatch.n_tokens;
+ const uint32_t n_seqs = ubatch.n_seqs;
+ const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
if (cache.recurrent) {
// For recurrent state architectures (like Mamba or RWKV),
@@ -130,16 +130,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// A slot should always be contiguous.
// can only process batches with an equal number of new tokens in each sequence
- GGML_ASSERT(batch.equal_seqs);
+ GGML_ASSERT(ubatch.equal_seqs);
int32_t min = cache.size - 1;
int32_t max = 0;
// everything should fit if all seq_ids are smaller than the max
for (uint32_t s = 0; s < n_seqs; ++s) {
- const uint32_t n_seq_id = batch.n_seq_id[s];
+ const uint32_t n_seq_id = ubatch.n_seq_id[s];
for (uint32_t j = 0; j < n_seq_id; ++j) {
- const llama_seq_id seq_id = batch.seq_id[s][j];
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
// too big seq_id
@@ -198,7 +198,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// find usable cell range
for (uint32_t s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = batch.seq_id[s][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
llama_kv_cell & seq_meta = cache.cells[seq_id];
bool has_cell = false;
if (seq_meta.tail >= 0) {
@@ -237,7 +237,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// gather and re-order
for (uint32_t s = 0; s < n_seqs; ++s) {
int32_t dst_id = s + min;
- int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
+ int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
if (dst_id != src_id) {
llama_kv_cell & dst_cell = cache.cells[dst_id];
llama_kv_cell & src_cell = cache.cells[src_id];
@@ -258,7 +258,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// update the pos of the used seqs
for (uint32_t s = 0; s < n_seqs; ++s) {
- const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+ const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
int32_t cell_id = s + min;
llama_kv_cell & cell = cache.cells[cell_id];
@@ -266,12 +266,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
// What should happen when the pos backtracks or skips a value?
// Clearing the state mid-batch would require special-casing which isn't done.
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
- __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
+ __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
}
cell.pos = last_pos;
cell.seq_id.clear();
- for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
- const llama_seq_id seq_id = batch.seq_id[s][j];
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
+ const llama_seq_id seq_id = ubatch.seq_id[s][j];
cell.seq_id.insert(seq_id);
cache.cells[seq_id].tail = cell_id;
}
@@ -325,10 +325,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
for (uint32_t s = 0; s < n_seqs; s++) {
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
uint32_t k = s*n_seq_tokens + i;
- cache.cells[cache.head + k].pos = batch.pos[k];
+ cache.cells[cache.head + k].pos = ubatch.pos[k];
- for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
- cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
+ for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
+ cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
}
}
}
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 0487d0836..9ffa04709 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -241,12 +241,16 @@ llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }
-int llama_file::fileno() const {
+int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
+#else
+#if defined(fileno)
+ return fileno(pimpl->fp);
#else
return ::fileno(pimpl->fp);
#endif
+#endif
}
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
@@ -265,7 +269,7 @@ struct llama_mmap::impl {
impl(struct llama_file * file, size_t prefetch, bool numa) {
size = file->size();
- int fd = file->fileno();
+ int fd = file->file_id();
int flags = MAP_SHARED;
if (numa) { prefetch = 0; }
#ifdef __linux__
@@ -357,7 +361,7 @@ struct llama_mmap::impl {
size = file->size();
- HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno());
+ HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
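
> The rename matters because some C libraries define `fileno` as a macro, which the preprocessor would also rewrite inside a method of that name; `file_id` sidesteps the clash while the `#if defined(fileno)` ladder above still calls whichever form the platform provides. A comment-only illustration with a hypothetical libc definition:

```cpp
// Hypothetical libc header:
//   #define fileno(f) ((f)->_file)
//
// The declaration below would then trigger the function-like macro during
// preprocessing and fail to compile:
//   struct llama_file { int fileno() const; };
//
// A method named file_id() is never touched by the macro; the implementation
// still calls fileno(fp) (the macro) or ::fileno(fp) (the POSIX function).
```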
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index 6bcddee8c..1da9ecb6b 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -18,7 +18,7 @@ struct llama_file {
size_t tell() const;
size_t size() const;
- int fileno() const;
+ int file_id() const; // fileno overload
void seek(size_t offset, int whence) const;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index af70705f6..9fc78756b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -70,6 +70,7 @@ const char * llm_type_name(llm_type type) {
case MODEL_70B: return "70B";
case MODEL_236B: return "236B";
case MODEL_314B: return "314B";
+ case MODEL_671B: return "671B";
case MODEL_SMALL: return "0.1B";
case MODEL_MEDIUM: return "0.4B";
case MODEL_LARGE: return "0.8B";
@@ -129,6 +130,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
}
}
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+ switch (type) {
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+ default: return "unknown";
+ }
+}
+
std::string llama_model_arch_name (const llama_model & model) {
return llm_arch_name(model.arch);
}
@@ -793,6 +802,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_COHERE2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_8B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_DBRX:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -930,11 +949,19 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+ // that have no expert_gating_func model parameter set
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+ }
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
switch (hparams.n_layer) {
case 27: model.type = e_model::MODEL_16B; break;
case 60: model.type = e_model::MODEL_236B; break;
+ case 61: model.type = e_model::MODEL_671B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -1266,6 +1293,10 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
tokenizer_pre == "deepseek-coder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
vocab.tokenizer_clean_spaces = false;
+ } else if (
+ tokenizer_pre == "deepseek-v3") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+ vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "falcon") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1912,24 +1943,24 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
// special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
- if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+ if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+ if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+ if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+ if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
- if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
- if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
- if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
- if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
- if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
- if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
+ if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+ if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+ if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+ if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+ if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+ if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
for (const auto & id : vocab.special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -1951,6 +1982,8 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
}
@@ -1996,6 +2029,10 @@ struct llama_model_params llama_model_default_params() {
}
void llama_free_model(struct llama_model * model) {
+ llama_model_free(model);
+}
+
+void llama_model_free(struct llama_model * model) {
delete model;
}
@@ -2051,6 +2088,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM:
case LLM_ARCH_XVERSE:
case LLM_ARCH_COMMAND_R:
+ case LLM_ARCH_COHERE2:
case LLM_ARCH_OLMO:
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK:
diff --git a/src/llama-model.h b/src/llama-model.h
index 01c780c41..ce038932d 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -63,6 +63,7 @@ enum llm_type {
MODEL_70B,
MODEL_236B,
MODEL_314B,
+ MODEL_671B,
MODEL_SMALL,
MODEL_MEDIUM,
MODEL_LARGE,
@@ -213,6 +214,7 @@ struct llama_layer {
struct ggml_tensor * ffn_down_b = nullptr; // b2
struct ggml_tensor * ffn_up_b = nullptr; // b3
struct ggml_tensor * ffn_act = nullptr;
+ struct ggml_tensor * ffn_exp_probs_b = nullptr;
// mamba proj
struct ggml_tensor * ssm_in = nullptr;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 0526f1d1c..e84773868 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -22,7 +22,7 @@ static void zeros(std::ofstream & file, size_t n) {
}
}
-struct quantize_state_internal {
+struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
@@ -43,13 +43,13 @@ struct quantize_state_internal {
// used to figure out if a model shares tok_embd with the output weight
bool has_output = false;
- quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+ quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
@@ -121,7 +121,7 @@ static void llama_tensor_dequantize_internal(
workers.clear();
}
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -410,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
return new_type;
}
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
return new_size;
}
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
@@ -534,7 +534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
llm_load_hparams(ml, model);
llm_load_stats (ml, model);
- struct quantize_state_internal qs(model, params);
+ struct quantize_state_impl qs(model, params);
if (params->only_copy) {
ftype = model.ftype;
@@ -837,7 +837,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
- llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
@@ -866,7 +866,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
- new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}
@@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
const char * fname_out,
const llama_model_quantize_params * params) {
try {
- llama_model_quantize_internal(fname_inp, fname_out, params);
+ llama_model_quantize_impl(fname_inp, fname_out, params);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
return 1;
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 69cea2f14..ef5a576cc 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
for (int i = 0; i < (int)cur_p->size; ++i) {
const float val = cur_p->data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
- ib = std::max(0, std::min(nbuckets-1, ib));
+ ib = std::max(0, std::min(nbuckets - 1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
@@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
for (int i = 0; i < (int)cur_p->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
- *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
+ *bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
}
}
ptr = tmp_tokens.data();
int ndone = 0;
- for (int j = nbuckets-1; j > ib; --j) {
+ for (int j = nbuckets - 1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
@@ -1832,7 +1832,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
if (n > 0) {
lt = k;
- rt = k+n-1;
+ rt = k + n - 1;
}
} else {
// If k is inside the current Z-box, consider two cases.
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 99357cb93..b638859ca 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -609,6 +609,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"\\p{N}+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+ regex_exprs = {
+ "\\p{N}{1,3}",
+ "[一-龥-ゟ゠-ヿ]+",
+ "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = {
"[\r\n]",
@@ -717,7 +724,7 @@ struct llm_tokenizer_bpe_session {
bool append_bos(std::vector<llama_vocab::id> & output) const {
if (vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != -1);
+ GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_bos_id);
return true;
}
@@ -726,7 +733,7 @@ struct llm_tokenizer_bpe_session {
bool append_eos(std::vector<llama_vocab::id> & output) const {
if (vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != -1);
+ GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_eos_id);
return true;
}
@@ -1623,7 +1630,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
- buffer.erase_after(std::next(buffer.begin(), (source-1)));
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
}
// repeat for the right side
@@ -1637,7 +1644,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
- buffer.erase_after(std::next(buffer.begin(), (source-1)));
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
}
break;
}
@@ -1675,7 +1682,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
bool is_prev_special = true; // prefix with space if first token
if (add_special && vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != -1);
+ GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_bos_id);
is_prev_special = true;
}
@@ -1710,7 +1717,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
// }
if (add_special && vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != -1);
+ GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_eos_id);
}
} break;
@@ -1769,7 +1776,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
case LLAMA_VOCAB_TYPE_WPM:
{
if (add_special) {
- GGML_ASSERT(vocab.special_cls_id != -1);
+ GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_cls_id);
}
@@ -1789,14 +1796,14 @@ std::vector llama_tokenize_internal(
}
if (add_special) {
- GGML_ASSERT(vocab.special_sep_id != -1);
+ GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_sep_id);
}
} break;
case LLAMA_VOCAB_TYPE_UGM:
{
if (add_special && vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != -1);
+ GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_bos_id);
}
llm_tokenizer_ugm_session session(vocab);
@@ -1821,7 +1828,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
//}
if (add_special && vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != -1);
+ GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(vocab.special_eos_id);
}
} break;
@@ -1890,7 +1897,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
}
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
- return token != -1 && vocab.special_eog_ids.count(token) > 0;
+ return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
}
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
@@ -2134,7 +2141,7 @@ int32_t llama_detokenize_impl(
}
if (remove_special && vocab.tokenizer_add_eos) {
- if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
+ if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) {
n_tokens--;
}
}
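
> Using `LLAMA_TOKEN_NULL` instead of a bare `-1` makes the "no such token" sentinel self-documenting, and the same spelling works in caller code. A minimal sketch:

```cpp
#include "llama.h"

// Prefer the named sentinel when probing for optional special tokens.
static bool has_bos(const llama_model * model) {
    return llama_token_bos(model) != LLAMA_TOKEN_NULL;
}
```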
diff --git a/src/llama.cpp b/src/llama.cpp
index 959d97f95..151725d34 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -23,12 +23,8 @@
#include
#include
#include
-#include
#include
-#include
-#include
#include
-#include
#include
#include
#include
@@ -36,7 +32,6 @@
#include
#include
#include
-#include
#include