Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	examples/llama-bench/README.md
#	examples/llama-bench/llama-bench.cpp
#	examples/llava/CMakeLists.txt
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/element_wise.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	tests/test-chat-template.cpp

Commit b2ecfa0f55: 26 changed files with 724 additions and 499 deletions.
@@ -674,8 +674,12 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
     return {};
 }

-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
-    throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
+    if (!url.empty()) {
+        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
+    }
+
+    return {};
 }

 #endif // LLAMA_USE_CURL

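The reworked stub keeps a no-CURL build linking and only fails when a download is actually requested. A minimal caller-side sketch of that behavior (the stand-in definition below mirrors the stub above purely for illustration; in the real tree the declaration comes from the common headers):

```cpp
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in mirroring the no-CURL stub above, for illustration only.
struct common_remote_params {};
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
    if (!url.empty()) {
        throw std::runtime_error("error: built without CURL, cannot download model from the internet");
    }
    return {};
}

int main() {
    try {
        auto res = common_remote_get_content("https://example.com/model.gguf", {});
        (void) res;
    } catch (const std::exception & e) {
        // in a no-CURL build, any non-empty URL ends up here
        return 1;
    }
    return 0;
}
```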
@@ -78,7 +78,7 @@ class ModelBase:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,

@@ -454,13 +454,6 @@ class ModelBase:


 class TextModel(ModelBase):
-    @classmethod
-    def __init_subclass__(cls):
-        # can't use an abstract property, because overriding it without type errors
-        # would require using decorated functions instead of simply defining the property
-        if "model_arch" not in cls.__dict__:
-            raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
-
     def set_vocab(self):
         self._set_vocab_gpt2()

@@ -3373,14 +3366,7 @@ class BertModel(TextModel):

         return [(self.map_tensor_name(name), data_torch)]

-
-@ModelBase.register("RobertaModel")
-class RobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
+    def _xlmroberta_tokenizer_init(self) -> None:
         # we need the pad_token_id to know how to chop down position_embd matrix
         if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
             self._position_offset = 1 + pad_token_id

@@ -3389,82 +3375,7 @@ class RobertaModel(BertModel):
         else:
             self._position_offset = None

-    def set_vocab(self):
-        """Support BPE tokenizers for roberta models"""
-        bpe_tok_path = self.dir_model / "tokenizer.json"
-        if bpe_tok_path.exists():
-            self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
-
-            # we need this to validate the size of the token_type embeddings
-            # though currently we are passing all zeros to the token_type embeddings
-            # "Sequence A" or "Sequence B"
-            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
-
-        else:
-            return super().set_vocab()
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "roberta.", remove the prefix
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
-
-        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
-        if name == "embeddings.position_embeddings.weight":
-            if self._position_offset is not None:
-                data_torch = data_torch[self._position_offset:,:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
-
-@ModelBase.register("NomicBertModel")
-class NomicBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.NOMIC_BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # the HF config claims n_ctx=8192, but it uses RoPE scaling
-        self.hparams["n_ctx"] = 2048
-
-        # SwigLU activation
-        assert self.hparams["activation_function"] == "swiglu"
-        # this doesn't do anything in the HF version
-        assert self.hparams["causal"] is False
-        # no bias tensors
-        assert self.hparams["qkv_proj_bias"] is False
-        assert self.hparams["mlp_fc1_bias"] is False
-        assert self.hparams["mlp_fc2_bias"] is False
-        # norm at end of layer
-        assert self.hparams["prenorm"] is False
-        # standard RoPE
-        assert self.hparams["rotary_emb_fraction"] == 1.0
-        assert self.hparams["rotary_emb_interleaved"] is False
-        assert self.hparams["rotary_emb_scale_base"] is None
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-
-
-@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
-class XLMRobertaModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.BERT
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # we need the pad_token_id to know how to chop down position_embd matrix
-        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
-            self._position_offset = 1 + pad_token_id
-            if "max_position_embeddings" in self.hparams:
-                self.hparams["max_position_embeddings"] -= self._position_offset
-        else:
-            self._position_offset = None
-
-    def set_vocab(self):
+    def _xlmroberta_set_vocab(self) -> None:
         # to avoid TypeError: Descriptors cannot be created directly
         # exception when importing sentencepiece_model_pb2
         os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

@@ -3546,6 +3457,138 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)

+
+@ModelBase.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("NomicBertModel")
+class NomicBertModel(BertModel):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model)
+
+        self.is_moe = bool(hparams.get("moe_every_n_layers"))
+        self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
+
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
+
+        self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()
+
+        # the HF config claims n_ctx=8192, but it uses RoPE scaling
+        self.hparams["n_ctx"] = 2048
+
+        assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
+
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors unless MoE
+        assert self.hparams["qkv_proj_bias"] == self.is_moe
+        assert self.hparams["mlp_fc1_bias"] == self.is_moe
+        assert self.hparams["mlp_fc2_bias"] == self.is_moe
+
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_vocab(self) -> None:
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()
+
+    def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
+        # If the tensor is an experts bias tensor, skip it by returning an empty list.
+        if "mlp.experts.bias" in name:
+            return []  # Explicitly return an empty list.
+
+        if "mlp.experts.mlp.w1" in name:
+            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
+            name += ".weight"
+
+        if "mlp.experts.mlp.w2" in name:
+            data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
+            data_torch = data_torch.transpose(1, 2)
+            name += ".weight"
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        if self.is_moe:
+            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
+            self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+            self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
+
+    def _is_tokenizer_xlmroberta(self) -> bool:
+        with open(self.dir_model / "tokenizer.json") as f:
+            tokenizer_json = json.load(f)
+        toktyp = tokenizer_json["model"]["type"]
+        if toktyp == "Unigram":
+            return True
+        if toktyp == "WordPiece":
+            return False
+        raise ValueError(f"unknown tokenizer: {toktyp}")
+
+
+@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._xlmroberta_tokenizer_init()
+
+    def set_vocab(self):
+        self._xlmroberta_set_vocab()
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
         # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main

@@ -184,8 +184,8 @@ struct clip_hparams {
     std::vector<int32_t> image_grid_pinpoints;
     int32_t image_crop_resolution;
     std::unordered_set<int32_t> vision_feature_layer;
-    int32_t attn_window_size;
-    int32_t n_wa_pattern;
+    int32_t attn_window_size = 0;
+    int32_t n_wa_pattern = 0;
 };

 struct clip_layer {

@@ -345,7 +345,6 @@ struct clip_ctx {
     float image_std[3];
     bool use_gelu = false;
     bool use_silu = false;
-    int32_t ftype = 1;

     gguf_context_ptr ctx_gguf;
     ggml_context_ptr ctx_data;

@@ -801,7 +800,6 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
     const int image_size_width = imgs.entries[0]->nx;
     const int image_size_height = imgs.entries[0]->ny;

-    const bool use_mrope = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
     const bool use_window_attn = hparams.n_wa_pattern > 0;

     const int n_wa_pattern = hparams.n_wa_pattern;

@@ -810,10 +808,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
     const int patches_w = image_size_width / patch_size;
     const int patches_h = image_size_height / patch_size;
     const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
-    const int num_position_ids = use_mrope ? num_positions * 4 : num_positions;
+    const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
+    const int n_layer = hparams.n_layer;
     const float eps = hparams.eps;

     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

@@ -895,7 +894,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
     }

     // loop over layers
-    for (int il = 0; il < ctx->max_feature_layer; il++) {
+    for (int il = 0; il < n_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states

         // rmsnorm1

@@ -1140,15 +1139,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        if (ctx->minicpmv_version == 2) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
-        }
-        else if (ctx->minicpmv_version == 3) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-        }
-        else if (ctx->minicpmv_version == 4) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-        }
+        int n_output_dim = clip_n_mmproj_embd(ctx);
+        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }

@@ -1486,23 +1478,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     }

     { // attention
-        int hidden_size = 4096;
+        int hidden_size = clip_n_mmproj_embd(ctx);
         const int d_head = 128;
         int n_head = hidden_size/d_head;
         int num_query = 96;
         if (ctx->minicpmv_version == 2) {
-            hidden_size = 4096;
-            n_head = hidden_size/d_head;
             num_query = 96;
         }
         else if (ctx->minicpmv_version == 3) {
-            hidden_size = 3584;
-            n_head = hidden_size/d_head;
             num_query = 64;
         }
         else if (ctx->minicpmv_version == 4) {
-            hidden_size = 3584;
-            n_head = hidden_size/d_head;
             num_query = 64;
         }


@@ -1613,7 +1599,7 @@ struct clip_model_loader {
     clip_ctx & ctx_clip;
     std::string fname;

-    size_t model_size; // in bytes
+    size_t model_size = 0; // in bytes

     // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
     clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {

@@ -1810,6 +1796,10 @@ struct clip_model_loader {
             LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
             LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
             LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
+            LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
+            LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+            LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
+            LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
             LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
             LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
         }

@@ -2972,15 +2962,18 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }

+// deprecated
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    const int32_t nx = ctx->vision_model.hparams.image_size;
+    const int32_t ny = ctx->vision_model.hparams.image_size;
+    return clip_embd_nbytes_by_img(ctx, nx, ny);
 }

-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
     clip_image_f32 img;
     img.nx = img_w;
     img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

 int32_t clip_get_image_size(const struct clip_ctx * ctx) {

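Note that the argument order of clip_embd_nbytes_by_img changes here from (img_h, img_w) to (img_w, img_h). A short caller-side sketch of sizing an embedding buffer under the new order (assumes the clip.h declarations from this diff and an already-loaded ctx; names are illustrative):

```cpp
// Sketch only: size an embedding buffer for one preprocessed image.
#include <vector>
#include "clip.h"

std::vector<float> alloc_image_embd(const clip_ctx * ctx, int nx, int ny) {
    // new signature: width first, then height
    const size_t nbytes = clip_embd_nbytes_by_img(ctx, /*img_w=*/nx, /*img_h=*/ny);
    return std::vector<float>(nbytes / sizeof(float));
}
```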
@@ -3010,14 +3003,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }

+// deprecated
 int clip_n_patches(const struct clip_ctx * ctx) {
     clip_image_f32 img;
     img.nx = ctx->vision_model.hparams.image_size;
     img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+    return clip_n_output_tokens(ctx, &img);
 }

+// deprecated
 int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_n_output_tokens(ctx, img);
+}
+
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    const int n_total = clip_n_output_tokens(ctx, img);
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
+    }
+    return n_total;
+}
+
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
+    }
+    return 1;
+}
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;

     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

@@ -3179,15 +3195,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
     const int pos_w = ctx->load_image_size.width / patch_size;
     const int pos_h = ctx->load_image_size.height / patch_size;

     const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

+    auto get_inp_tensor = [&gf](const char * name) {
+        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
+        if (inp == nullptr) {
+            GGML_ABORT("Failed to get tensor %s", name);
+        }
+        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
+            GGML_ABORT("Tensor %s is not an input tensor", name);
+        }
+        return inp;
+    };
+
+    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_F32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_I32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    // set input pixel values
     {
-        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
-        std::vector<float> inp_data(ggml_nelements(inp_raw));
-        float * data = inp_data.data();
+        size_t nelem = 0;
+        for (const auto & img : imgs.entries) {
+            nelem += img->nx * img->ny * 3;
+        }
+        std::vector<float> inp_raw(nelem);

         // layout of data (note: the channel dim is unrolled to better visualize the layout):
         //

@@ -3206,7 +3250,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             const int n = nx * ny;

             for (int b = 0; b < batch_size; b++) {
-                float * batch_entry = data + b * (3*n);
+                float * batch_entry = inp_raw.data() + b * (3*n);
                 for (int y = 0; y < ny; y++) {
                     for (int x = 0; x < nx; x++) {
                         size_t base_src = 3*(y * nx + x); // idx of the first channel

@@ -3218,266 +3262,207 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
             }
         }
-        ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
+        set_input_f32("inp_raw", inp_raw);
     }

-    if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
-        {
-            // inspired from siglip:
-            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
-            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
-            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-            std::vector<int> pos_data(ggml_nelements(positions));
-            int * data = pos_data.data();
-            int bucket_coords_h[1024];
-            int bucket_coords_w[1024];
-            for (int i = 0; i < pos_h; i++){
-                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
-            }
-            for (int i = 0; i < pos_w; i++){
-                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
-            }
-            for (int i = 0, id = 0; i < pos_h; i++){
-                for (int j = 0; j < pos_w; j++){
-                    data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
-                }
-            }
-            ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
-        }
-
-        {
-            // inspired from resampler of Qwen-VL:
-            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
-            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
-            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-            int embed_dim = 4096;
-            if (ctx->minicpmv_version == 2) {
-                embed_dim = 4096;
-            }
-            else if (ctx->minicpmv_version == 3) {
-                embed_dim = 3584;
-            }
-            else if (ctx->minicpmv_version == 4) {
-                embed_dim = 3584;
-            }
-            else {
-                GGML_ABORT("Unknown minicpmv version");
-            }
-
-            // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
-            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
-
-            std::vector<float> pos_data(ggml_nelements(pos_embed));
-            float * data = pos_data.data();
-            for(int i = 0; i < pos_w * pos_h; ++i){
-                for(int j = 0; j < embed_dim; ++j){
-                    data[i * embed_dim + j] = pos_embed_t[i][j];
-                }
-            }
-
-            ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed));
-        }
-    }
-    else {
-        // non-minicpmv models
-
-        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
-            // pw * ph = number of tokens output by ViT after apply patch merger
-            // ipw * ipw = number of vision token been processed inside ViT
-            const int merge_ratio = 2;
-            const int pw  = image_size_width  / patch_size / merge_ratio;
-            const int ph  = image_size_height / patch_size / merge_ratio;
-            const int ipw = image_size_width  / patch_size;
-            const int iph = image_size_height / patch_size;
-
-            std::vector<int> idx    (ph * pw);
-            std::vector<int> inv_idx(ph * pw);
-
-            if (use_window_attn) {
-                const int attn_window_size = 112;
-                struct ggml_tensor * window_idx     = ggml_graph_get_tensor(gf, "window_idx");
-                struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
-                struct ggml_tensor * window_mask    = ggml_graph_get_tensor(gf, "window_mask");
-
-                const int grid_window = attn_window_size / patch_size / merge_ratio;
-                int dst = 0;
-                // [num_vision_tokens, num_vision_tokens] attention mask tensor
-                std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
-                int mask_row = 0;
-
-                for (int y = 0; y < ph; y += grid_window)
-                {
-                    for (int x = 0; x < pw; x += grid_window)
-                    {
-                        const int win_h = std::min(grid_window, ph - y);
-                        const int win_w = std::min(grid_window, pw - x);
-                        const int dst_0 = dst;
-                        // group all tokens belong to the same window togather (to a continue range)
-                        for (int dy = 0; dy < win_h; dy++) {
-                            for (int dx = 0; dx < win_w; dx++) {
-                                const int src = (y + dy) * pw + (x + dx);
-                                assert(src < (int)idx.size());
-                                assert(dst < (int)inv_idx.size());
-                                idx    [src] = dst;
-                                inv_idx[dst] = src;
-                                dst++;
-                            }
-                        }
-
-                        for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
-                            int row_offset = mask_row * (ipw * iph);
-                            std::fill(
-                                mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
-                                mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
-                                0.0);
-                            mask_row++;
-                        }
-                    }
-                }
-
-                ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
-                ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
-                ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
-            } else {
-                std::iota(idx.begin(), idx.end(), 0);
-                std::iota(inv_idx.begin(), inv_idx.end(), 0);
-            }
-
-            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-            const int mpow = merge_ratio * merge_ratio;
-            std::vector<int> positions_data(ggml_nelements(positions));
-            int * data = positions_data.data();
-
-            int ptr = 0;
-            for (int y = 0; y < iph; y += merge_ratio)
-            {
-                for (int x = 0; x < ipw; x += merge_ratio)
-                {
-                    for (int dy = 0; dy < 2; dy++) {
-                        for (int dx = 0; dx < 2; dx++) {
-                            auto remap = idx[ptr / mpow];
-                            remap = remap * mpow + (ptr % mpow);
-
-                            data[                  remap] = y + dy;
-                            data[    num_patches + remap] = x + dx;
-                            data[2 * num_patches + remap] = y + dy;
-                            data[3 * num_patches + remap] = x + dx;
-                            ptr++;
-                        }
-                    }
-                }
-            }
-
-            ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
-        }
-        else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-            // do nothing
-        }
-        else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
-            // do nothing
-        }
-        else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
-            // set the 2D positions
-            int n_patches_per_col = image_size_width / patch_size;
-            std::vector<int> pos_data(num_positions);
-            struct ggml_tensor * pos;
-            // dimension H
-            pos = ggml_graph_get_tensor(gf, "pos_h");
-            for (int i = 0; i < num_positions; i++) {
-                pos_data[i] = i / n_patches_per_col;
-            }
-            ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos));
-            // dimension W
-            pos = ggml_graph_get_tensor(gf, "pos_w");
-            for (int i = 0; i < num_positions; i++) {
-                pos_data[i] = i % n_patches_per_col;
-            }
-            ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos));
-        }
-        else {
-            // llava and other models
-            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-
-            int* positions_data = (int*)malloc(ggml_nbytes(positions));
-            for (int i = 0; i < num_positions; i++) {
-                positions_data[i] = i;
-            }
-            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-            free(positions_data);
-
-            if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) {
-                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                // The patches vector is used to get rows to index into the embeds with;
-                // we should skip dim 0 only if we have CLS to avoid going out of bounds
-                // when retrieving the rows.
-                int patch_offset = model.class_embedding ? 1 : 0;
-                int* patches_data = (int*)malloc(ggml_nbytes(patches));
-                for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + patch_offset;
-                }
-                ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-                free(patches_data);
-            }
-        }
-    }
-
-    if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) {
-        struct ggml_tensor * window_idx     = ggml_graph_get_tensor(gf, "window_idx");
-        struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
-        struct ggml_tensor * window_mask    = ggml_graph_get_tensor(gf, "window_mask");
-
-        const int merge_ratio = 2;
-        const int attn_window_size = 112;
-        const int pw  = image_size_width  / patch_size / merge_ratio;
-        const int ph  = image_size_height / patch_size / merge_ratio;
-        const int grid_window = attn_window_size / patch_size / merge_ratio;
-        const int ipw = image_size_width  / patch_size;
-        const int iph = image_size_height / patch_size;
-        /*
-            pw * ph = number of tokens output by ViT after apply patch merger
-            ipw * ipw = number of vision token been processed inside ViT
-        */
-
-        std::vector<int> idx(ph * pw);
-        std::vector<int> inv_idx(ph * pw);
-        int dst = 0;
-        // [num_vision_tokens, num_vision_tokens] attention mask tensor
-        std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
-        int mask_row = 0;
-
-        for (int y = 0; y < ph; y+=grid_window)
-        {
-            for (int x = 0; x < pw; x+=grid_window)
-            {
-                const int win_h = std::min(grid_window, ph - y);
-                const int win_w = std::min(grid_window, pw - x);
-                const int dst_0 = dst;
-                // group all tokens belong to the same window togather (to a continue range)
-                for (int dy = 0; dy < win_h; dy++) {
-                    for (int dx = 0; dx < win_w; dx++) {
-                        const int src = (y + dy) * pw + (x + dx);
-                        assert(src < (int)idx.size());
-                        assert(dst < (int)inv_idx.size());
-                        idx[src] = dst;
-                        inv_idx[dst] = src;
-                        dst++;
-                    }
-                }
-
-                for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
-                    int row_offset = mask_row * (ipw * iph);
-                    std::fill(
-                        mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
-                        mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
-                        0.0);
-                    mask_row++;
-                }
-            }
-        }
-
-        ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
-        ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
-        ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
-    }
+    // set input per projector
+    switch (ctx->proj_type) {
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                // inspired from siglip:
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+                std::vector<int32_t> positions(pos_h * pos_w);
+                int bucket_coords_h[1024];
+                int bucket_coords_w[1024];
+                for (int i = 0; i < pos_h; i++){
+                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+                }
+                for (int i = 0; i < pos_w; i++){
+                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+                }
+                for (int i = 0, id = 0; i < pos_h; i++){
+                    for (int j = 0; j < pos_w; j++){
+                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                    }
+                }
+                set_input_i32("positions", positions);
+
+                // inspired from resampler of Qwen-VL:
+                //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+                //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+                int embed_dim = clip_n_mmproj_embd(ctx);
+
+                // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
+                auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+                std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
+                for(int i = 0; i < pos_w * pos_h; ++i){
+                    for(int j = 0; j < embed_dim; ++j){
+                        pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
+                    }
+                }
+
+                set_input_f32("pos_embed", pos_embed);
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+            {
+                const int merge_ratio = 2;
+                const int pw = image_size_width  / patch_size;
+                const int ph = image_size_height / patch_size;
+                std::vector<int> positions(num_positions * 4);
+                int ptr = 0;
+                for (int y = 0; y < ph; y += merge_ratio) {
+                    for (int x = 0; x < pw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                positions[                  ptr] = y + dy;
+                                positions[    num_patches + ptr] = x + dx;
+                                positions[2 * num_patches + ptr] = y + dy;
+                                positions[3 * num_patches + ptr] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_QWEN25VL:
+            {
+                // pw * ph = number of tokens output by ViT after apply patch merger
+                // ipw * ipw = number of vision token been processed inside ViT
+                const int merge_ratio = 2;
+                const int pw  = image_size_width  / patch_size / merge_ratio;
+                const int ph  = image_size_height / patch_size / merge_ratio;
+                const int ipw = image_size_width  / patch_size;
+                const int iph = image_size_height / patch_size;
+
+                std::vector<int> idx    (ph * pw);
+                std::vector<int> inv_idx(ph * pw);
+
+                if (use_window_attn) {
+                    const int attn_window_size = 112;
+                    const int grid_window = attn_window_size / patch_size / merge_ratio;
+                    int dst = 0;
+                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
+                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
+                    int mask_row = 0;
+
+                    for (int y = 0; y < ph; y += grid_window) {
+                        for (int x = 0; x < pw; x += grid_window) {
+                            const int win_h = std::min(grid_window, ph - y);
+                            const int win_w = std::min(grid_window, pw - x);
+                            const int dst_0 = dst;
+                            // group all tokens belong to the same window togather (to a continue range)
+                            for (int dy = 0; dy < win_h; dy++) {
+                                for (int dx = 0; dx < win_w; dx++) {
+                                    const int src = (y + dy) * pw + (x + dx);
+                                    GGML_ASSERT(src < (int)idx.size());
+                                    GGML_ASSERT(dst < (int)inv_idx.size());
+                                    idx    [src] = dst;
+                                    inv_idx[dst] = src;
+                                    dst++;
+                                }
+                            }
+
+                            for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
+                                int row_offset = mask_row * (ipw * iph);
+                                std::fill(
+                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
+                                    mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
+                                    0.0);
+                                mask_row++;
+                            }
+                        }
+                    }
+
+                    set_input_i32("window_idx", idx);
+                    set_input_i32("inv_window_idx", inv_idx);
+                    set_input_f32("window_mask", mask);
+                } else {
+                    for (int i = 0; i < ph * pw; i++) {
+                        idx[i] = i;
+                    }
+                }
+
+                const int mpow = merge_ratio * merge_ratio;
+                std::vector<int> positions(num_positions * 4);
+
+                int ptr = 0;
+                for (int y = 0; y < iph; y += merge_ratio) {
+                    for (int x = 0; x < ipw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                auto remap = idx[ptr / mpow];
+                                remap = (remap * mpow) + (ptr % mpow);
+
+                                positions[                  remap] = y + dy;
+                                positions[    num_patches + remap] = x + dx;
+                                positions[2 * num_patches + remap] = y + dy;
+                                positions[3 * num_patches + remap] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int> pos_data(num_positions);
+                // dimension H
+                for (int i = 0; i < num_positions; i++) {
+                    pos_data[i] = i / n_patches_per_col;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_positions; i++) {
+                    pos_data[i] = i % n_patches_per_col;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        case PROJECTOR_TYPE_GLM_EDGE:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(num_positions);
+                for (int i = 0; i < num_positions; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(num_positions);
+                for (int i = 0; i < num_positions; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+
+                // The patches vector is used to get rows to index into the embeds with;
+                // we should skip dim 0 only if we have CLS to avoid going out of bounds
+                // when retrieving the rows.
+                int patch_offset = model.class_embedding ? 1 : 0;
+                std::vector<int32_t> patches(num_patches);
+                for (int i = 0; i < num_patches; i++) {
+                    patches[i] = i + patch_offset;
+                }
+                set_input_i32("patches", patches);
+            } break;
+        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_IDEFICS3:
+            {
+                // do nothing
+            } break;
+        default:
+            GGML_ABORT("Unknown projector type");
+    }

     if (ggml_backend_is_cpu(ctx->backend)) {

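The Qwen2.5-VL window-attention path above groups patch-merger tokens into contiguous per-window ranges via an index map idx and its inverse inv_idx. A small self-contained sketch of that grouping (toy grid sizes, not the real model dimensions) with a check that the two mappings really are inverse permutations:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    // toy values: an 8x8 token grid, windows of 4x4 tokens (grid_window = 4)
    const int pw = 8, ph = 8, grid_window = 4;
    std::vector<int> idx(ph * pw), inv_idx(ph * pw);

    int dst = 0;
    for (int y = 0; y < ph; y += grid_window) {
        for (int x = 0; x < pw; x += grid_window) {
            const int win_h = std::min(grid_window, ph - y);
            const int win_w = std::min(grid_window, pw - x);
            // tokens of one window become one contiguous destination range
            for (int dy = 0; dy < win_h; dy++) {
                for (int dx = 0; dx < win_w; dx++) {
                    const int src = (y + dy) * pw + (x + dx);
                    idx[src]     = dst;
                    inv_idx[dst] = src;
                    dst++;
                }
            }
        }
    }

    // idx and inv_idx must be inverse permutations of each other
    for (int i = 0; i < ph * pw; i++) {
        assert(inv_idx[idx[i]] == i);
    }
    return 0;
}
```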
@@ -3695,7 +3680,7 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
 }

 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
-    return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL;
+    return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
 }

 bool clip_is_llava(const struct clip_ctx * ctx) {

@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);

 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);

 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);

@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
+GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
+    "use clip_n_output_tokens instead");
+GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
+    "use clip_n_output_tokens instead");
+
+CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);

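A hedged sketch of how a caller might use the new token-count API declared above (assumes clip.h plus an already-initialized ctx and a preprocessed clip_image_f32; per the comments in the header, the X/Y counts describe the 2D position grid for M-RoPE models, and Y is 1 otherwise):

```cpp
// Sketch only; assumes "clip.h" and an already-initialized ctx / img.
#include <cstdio>
#include "clip.h"

void report_image_tokens(const clip_ctx * ctx, clip_image_f32 * img) {
    const int n_tokens = clip_n_output_tokens  (ctx, img);
    const int n_x      = clip_n_output_tokens_x(ctx, img);
    const int n_y      = clip_n_output_tokens_y(ctx, img);
    // buffer size (in floats) for the projected embeddings of this image
    const size_t n_floats = (size_t) n_tokens * clip_n_mmproj_embd(ctx);
    printf("tokens=%d (x=%d, y=%d), embd floats=%zu\n", n_tokens, n_x, n_y, n_floats);
}
```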
@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }

 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;

@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

     model.ctx = ggml_init(params);

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {

@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));

     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python

@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {

@@ -352,8 +352,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");

@@ -391,7 +391,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

         int n_img_pos_out;
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
         *n_img_pos = n_img_pos_out;

         for (size_t i = 0; i < image_embd_v.size(); i++) {

@@ -136,39 +136,6 @@ struct mtmd_cli_context {
     }
 };

-struct decode_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
     llama_tokens generated_tokens;
     for (int i = 0; i < n_predict; i++) {

@@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
 return 1;
 }

-ctx.n_past += mtmd_helper_get_n_tokens(chunks);
+ctx.n_past += mtmd_helper_get_n_pos(chunks);

 return 0;
 }
@@ -371,6 +338,7 @@ int main(int argc, char ** argv) {
 }
 }
 if (g_is_interrupted) LOG("\nInterrupted by user\n");
+LOG("\n\n");
 llama_perf_context_print(ctx.lctx);
 return g_is_interrupted ? 130 : 0;
 }
@@ -40,11 +40,14 @@ struct mtmd_context {
 llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
 llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row

+bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+
 // TODO @ngxson : add timings

 mtmd_context(const char * mmproj_fname,
 const llama_model * text_model,
 const mtmd_context_params & ctx_params) :
+text_model (text_model),
 print_timings(ctx_params.print_timings),
 n_threads (ctx_params.n_threads),
 image_marker (ctx_params.image_marker)
@@ -56,9 +59,8 @@ struct mtmd_context {
 if (!ctx_clip) {
 throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
 }
-this->text_model = text_model;

-GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
+use_mrope = clip_is_qwen2vl(ctx_clip);

 int minicpmv_version = clip_is_minicpmv(ctx_clip);
 if (minicpmv_version == 2) {
@@ -126,6 +128,7 @@ struct mtmd_image_tokens_data {
 struct mtmd_image_tokens {
 uint32_t nx; // number of tokens in x direction
 uint32_t ny; // number of tokens in y direction
+bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
 uint32_t n_tokens() const { return nx * ny; }
 clip_image_f32_batch batch_f32; // preprocessed image patches
 std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -202,10 +205,14 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
 }

-// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
-// for glm-edge, we don't need to add because the tokens are already in the returned embeddings
+else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+// <|vision_start|> ... (image embeddings) ... <|vision_end|>
+marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
+string_replace_all(prompt_modified, ctx->image_marker, marker_modified);

-// TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens
+}

+// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix

 std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
 output.clear();
@@ -229,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

 for (auto & entry : batch_f32.entries) {
 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
+image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
 image_tokens->ny = 1;
 image_tokens->batch_f32.entries.push_back(std::move(entry));
 image_tokens->id = id;

@@ -246,7 +253,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 };

 for (const auto & part : parts) {
-//printf("tokenizing part: %s\n", part.c_str());
+// printf("tokenizing part: %s\n", part.c_str());
 bool add_bos = &parts.front() == &part;
 auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
 if (tokens.empty()) {
@@ -325,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 } else {
 size_t n_tokens = 0;
 for (const auto & entry : batch_f32.entries) {
-n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
+n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
 }

 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-image_tokens->nx = n_tokens;
-image_tokens->ny = 1; // TODO
+if (ctx->use_mrope) {
+// for Qwen2VL, we need this information for M-RoPE decoding positions
+image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
+image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
+image_tokens->use_mrope_pos = true;
+} else {
+// other models, we only need the total number of tokens
+image_tokens->nx = n_tokens;
+image_tokens->ny = 1;
+}
 image_tokens->batch_f32 = std::move(batch_f32);
 image_tokens->id = bitmaps[i_img].id; // optional

@@ -338,11 +353,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
 LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
 LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());

-if (clip_is_glm(ctx->ctx_clip)) {
-// glm-edge
-image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
-}
-
 mtmd_input_chunk chunk{
 MTMD_INPUT_CHUNK_TYPE_IMAGE,
 {},
@@ -380,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
 return image_tokens->id;
 }

+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+if (image_tokens->use_mrope_pos) {
+return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+}
+return image_tokens->n_tokens();
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
 int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
 ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
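Aside, not from the commit: a minimal self-contained sketch of the position accounting this helper enables; the struct and function below are made up for illustration.

// illustrative only: hypothetical caller-side n_past bookkeeping mirroring mtmd_image_tokens_get_n_pos()
#include <cstdint>
#include <vector>

struct fake_image_tokens { uint32_t nx, ny; bool use_mrope_pos; };

static int64_t advance_n_past(int64_t n_past, int64_t n_text_tokens, const std::vector<fake_image_tokens> & images) {
    n_past += n_text_tokens;                            // text: one position per token
    for (const auto & img : images) {
        const int64_t n_tokens = (int64_t) img.nx * img.ny;
        // M-RoPE: the whole image is a single temporal position,
        // even though it still contributes nx*ny embeddings to the batch
        n_past += img.use_mrope_pos ? 1 : n_tokens;
    }
    return n_past;
}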
@@ -397,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
 // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
 const auto & entries = image_tokens->batch_f32.entries;
 for (size_t i = 0; i < entries.size(); i++) {
-int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
+int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
 ok = clip_image_encode(
 ctx->ctx_clip,
 ctx->n_threads,

@@ -425,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
 if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
 n_tokens += chunk.tokens_text.size();
 } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-n_tokens += chunk.tokens_image->n_tokens();
+n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
 } else {
 GGML_ASSERT(false && "chunk type not supported");
 }
@@ -433,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
 return n_tokens;
 }

+llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
+llama_pos n_pos = 0;
+for (auto & chunk : chunks) {
+if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+n_pos += chunk.tokens_text.size();
+} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
+} else {
+GGML_ASSERT(false && "chunk type not supported");
+}
+}
+return n_pos;
+}
+
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
 struct decode_embd_batch {
+int n_pos_per_embd;
+int n_mmproj_embd;
 std::vector<llama_pos> pos;
+std::vector<llama_pos> pos_view; // used by mrope
 std::vector<int32_t> n_seq_id;
 std::vector<llama_seq_id> seq_id_0;
 std::vector<llama_seq_id *> seq_ids;
 std::vector<int8_t> logits;
 llama_batch batch;
-decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-pos .resize(n_tokens);
+pos .resize(n_tokens * n_pos_per_embd);
 n_seq_id.resize(n_tokens);
 seq_ids .resize(n_tokens + 1);
 logits .resize(n_tokens);
 seq_id_0.resize(1);
-seq_id_0[0] = seq_id;
 seq_ids [n_tokens] = nullptr;
 batch = {
 /*n_tokens =*/ n_tokens,
@@ -459,13 +492,64 @@ struct decode_embd_batch {
 /*seq_id =*/ seq_ids.data(),
 /*logits =*/ logits.data(),
 };
-for (int i = 0; i < n_tokens; i++) {
+}

+void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
+seq_id_0[0] = seq_id;
+for (int i = 0; i < batch.n_tokens; i++) {
 batch.pos [i] = pos_0 + i;
 batch.n_seq_id[i] = 1;
 batch.seq_id [i] = seq_id_0.data();
 batch.logits [i] = false;
 }
 }

+void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+GGML_ASSERT(n_pos_per_embd == 4);
+seq_id_0[0] = seq_id;
+for (int y = 0; y < ny; y++) {
+for (int x = 0; x < nx; x++) {
+int i = y * nx + x;
+pos[i ] = pos_0;
+pos[i + batch.n_tokens ] = pos_0 + y;
+pos[i + batch.n_tokens * 2] = pos_0 + x;
+pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+}
+}
+for (int i = 0; i < batch.n_tokens; i++) {
+batch.n_seq_id[i] = 1;
+batch.seq_id [i] = seq_id_0.data();
+batch.logits [i] = false;
+}
+}
+
+llama_batch get_view(int offset, int n_tokens) {
+llama_pos * pos_ptr;
+pos_view.clear();
+pos_view.resize(n_tokens * n_pos_per_embd);
+if (n_pos_per_embd > 1) {
+// mrope
+// for example, with layout of src: 1234...1234...1234...1234...
+// offset 2 will give us dst: 34...34...34...34...
+for (int i = 0; i < n_pos_per_embd; i++) {
+auto src = pos.begin() + i * batch.n_tokens + offset;
+pos_view.insert(pos_view.end(), src, src + n_tokens);
+}
+pos_ptr = pos_view.data();
+} else {
+// normal
+pos_ptr = pos.data() + offset;
+}
+return {
+/*n_tokens =*/ n_tokens,
+/*tokens =*/ nullptr,
+/*embd =*/ batch.embd + offset * n_mmproj_embd,
+/*pos =*/ pos_ptr,
+/*n_seq_id =*/ batch.n_seq_id + offset,
+/*seq_id =*/ batch.seq_id + offset,
+/*logits =*/ batch.logits + offset,
+};
+}
 };

 int32_t mtmd_helper_eval(mtmd_context * ctx,
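Aside, not from the commit: a self-contained sketch of the layout set_position_mrope() writes and get_view() slices — positions live in n_pos_per_embd consecutive sections of length n_tokens (temporal, height, width, unused), so a view copies the same slice out of each section.

// illustrative layout check; plain vectors stand in for llama_batch
#include <cstdio>
#include <vector>

int main() {
    const int nx = 3, ny = 2, n_tokens = nx * ny, n_pos_per_embd = 4, pos_0 = 10;
    std::vector<int> pos(n_tokens * n_pos_per_embd);
    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            const int i = y * nx + x;
            pos[i               ] = pos_0;       // temporal: same for the whole image
            pos[i + n_tokens    ] = pos_0 + y;   // height
            pos[i + n_tokens * 2] = pos_0 + x;   // width
            pos[i + n_tokens * 3] = 0;           // unused section
        }
    }
    // a "view" of 3 tokens starting at offset 2: take the matching slice from each section
    const int offset = 2, n_view = 3;
    std::vector<int> view;
    for (int s = 0; s < n_pos_per_embd; s++) {
        view.insert(view.end(), pos.begin() + s * n_tokens + offset,
                                pos.begin() + s * n_tokens + offset + n_view);
    }
    for (int v : view) printf("%d ", v);  // prints: 10 10 10 10 11 11 12 10 11 0 0 0
    printf("\n");
    return 0;
}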
@@ -478,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
 llama_pos n_past = pos0;
 llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;

 for (auto & chunk : chunks) {
 bool is_last = &chunk == &chunks.back();

@@ -525,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
 int32_t i_batch = 0;
 int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
 float * embd = mtmd_get_output_embd(ctx);
+decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
+const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
+
+if (mtmd_decode_use_mrope(ctx)) {
+batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
+} else {
+batch_embd.set_position_normal(n_past, seq_id);
+}
+
 if (mtmd_decode_use_non_causal(ctx)) {
 llama_set_causal_attn(lctx, false);
@@ -532,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
 }

 while (i_batch < n_img_batches) { // split into batches
-int32_t pos_offset = i_batch*n_batch;
+int pos_offset = i_batch*n_batch;
-int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
-float * embd_batch = embd + pos_offset*n_mmproj_embd;
+llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
-decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);

-printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);

 int64_t t1 = ggml_time_ms();
-ret = llama_decode(lctx, batch_img.batch);
+ret = llama_decode(lctx, batch_embd_view);
 if (ret != 0) {
 LOG_ERR("failed to decode image\n");
 llama_set_causal_attn(lctx, true); // restore causal attn

@@ -553,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
 }

 i_batch++;
-n_past += n_tokens_batch;
 }

+// for mrope, one image is one single **temporal** position
+n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
+
 if (mtmd_decode_use_non_causal(ctx)) {
 llama_set_causal_attn(lctx, true);
 }
@@ -603,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
 return false;
 }

+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+return ctx->use_mrope;
+}
+
 void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
 mtmd_image_tokens_free(val);
 }
@@ -102,6 +102,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
 MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
 MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
 MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
 MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);

 // returns 0 on success

@@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // whether we need to set non-causal mask before llama_decode
 MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);

+// whether the current model use M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
 //
 // helper functions (can be implemented based on other functions)
 //

-// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);

+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
+
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
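Aside, not from the commit: how a caller might use the two counting helpers side by side; the tokenization step is elided and hypothetical.

// sketch only, assuming `chunks` was filled by mtmd_tokenize()
mtmd_input_chunks chunks;
/* ... mtmd_tokenize(ctx, chunks, text, bitmaps) ... */
size_t    n_cells = mtmd_helper_get_n_tokens(chunks); // KV cache cells the chunks will occupy
llama_pos n_past  = mtmd_helper_get_n_pos(chunks);    // how far the position counter advances
// the two differ only for M-RoPE models, where an image counts as one temporal position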
@ -27,6 +27,8 @@
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
|
// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
|
||||||
|
// IT IS NOT A PRODUCTION CODE
|
||||||
|
|
||||||
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
|
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
|
||||||
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
|
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
|
||||||
|
@ -92,20 +94,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
|
||||||
|
|
||||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
|
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
|
||||||
int N = (int) tokens.size();
|
int N = (int) tokens.size();
|
||||||
std::vector<llama_pos> pos;
|
|
||||||
for (int i = 0; i < N; i += n_batch) {
|
for (int i = 0; i < N; i += n_batch) {
|
||||||
int n_eval = (int) tokens.size() - i;
|
int n_eval = (int) tokens.size() - i;
|
||||||
if (n_eval > n_batch) {
|
if (n_eval > n_batch) {
|
||||||
n_eval = n_batch;
|
n_eval = n_batch;
|
||||||
}
|
}
|
||||||
auto batch = llama_batch_get_one(&tokens[i], n_eval);
|
auto batch = llama_batch_get_one(&tokens[i], n_eval);
|
||||||
// TODO: add mrope pos ids somewhere else
|
|
||||||
pos.resize(batch.n_tokens * 4);
|
|
||||||
std::fill(pos.begin(), pos.end(), 0);
|
|
||||||
for (int j = 0; j < batch.n_tokens * 3; j ++) {
|
|
||||||
pos[j] = *st_pos_id + (j % batch.n_tokens);
|
|
||||||
}
|
|
||||||
batch.pos = pos.data();
|
|
||||||
|
|
||||||
if (llama_decode(ctx_llama, batch)) {
|
if (llama_decode(ctx_llama, batch)) {
|
||||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
|
@@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
 add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
 add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
-add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
-add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"

 # to test the big models, run: ./tests.sh big
 add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
@@ -399,8 +399,8 @@ extern "C" {

 // precision
 enum ggml_prec {
-GGML_PREC_DEFAULT,
+GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
-GGML_PREC_F32,
+GGML_PREC_F32 = 10,
 };

 // model file types
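Aside, not from the commit: a small sketch of why the explicit enum values matter — ggml_mul_mat_set_prec() stores the requested precision in the result tensor's op_params, which backends read back (as the CUDA change below does via dst->op_params[0]).

// minimal sketch: request F32 precision for a matmul and read the stored value back
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    ggml_mul_mat_set_prec(c, GGML_PREC_F32);
    const int prec = c->op_params[0]; // == GGML_PREC_F32 == 10; untouched ops keep 0 == GGML_PREC_DEFAULT
    ggml_free(ctx);
    return prec == GGML_PREC_F32 ? 0 : 1;
}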
@@ -78,13 +78,13 @@
 // Moore Threads
 #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)

-#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
-#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
@@ -1936,8 +1936,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
 } else if (!split && use_mul_mat_vec_q) {
 ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
-} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
-&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
 // general KQ + KQV multi-batch without FlashAttention
 ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
 } else if (use_mul_mat_vec) {
@ -104,6 +104,7 @@ class Keys:
|
||||||
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
||||||
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
|
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
|
||||||
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
|
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
|
||||||
|
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
|
||||||
POOLING_TYPE = "{arch}.pooling_type"
|
POOLING_TYPE = "{arch}.pooling_type"
|
||||||
LOGIT_SCALE = "{arch}.logit_scale"
|
LOGIT_SCALE = "{arch}.logit_scale"
|
||||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||||
|
@ -267,6 +268,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
REFACT = auto()
|
REFACT = auto()
|
||||||
BERT = auto()
|
BERT = auto()
|
||||||
NOMIC_BERT = auto()
|
NOMIC_BERT = auto()
|
||||||
|
NOMIC_BERT_MOE = auto()
|
||||||
JINA_BERT_V2 = auto()
|
JINA_BERT_V2 = auto()
|
||||||
BLOOM = auto()
|
BLOOM = auto()
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
|
@ -521,6 +523,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.REFACT: "refact",
|
MODEL_ARCH.REFACT: "refact",
|
||||||
MODEL_ARCH.BERT: "bert",
|
MODEL_ARCH.BERT: "bert",
|
||||||
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||||
|
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
||||||
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
||||||
MODEL_ARCH.BLOOM: "bloom",
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
|
@ -960,6 +963,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.NOMIC_BERT_MOE: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
MODEL_TENSOR.TOKEN_TYPES,
|
||||||
|
MODEL_TENSOR.POS_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
|
],
|
||||||
MODEL_ARCH.JINA_BERT_V2: [
|
MODEL_ARCH.JINA_BERT_V2: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
|
|
@ -728,6 +728,9 @@ class GGUFWriter:
|
||||||
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
|
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
|
||||||
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
|
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
|
||||||
|
|
||||||
|
def add_moe_every_n_layers(self, value: int) -> None:
|
||||||
|
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_swin_norm(self, value: bool) -> None:
|
def add_swin_norm(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
|
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
|
|
@ -290,6 +290,7 @@ class TensorNameMap:
|
||||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||||
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
|
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
|
||||||
"language_model.model.layers.{bid}.feed_forward.router", # llama4
|
"language_model.model.layers.{bid}.feed_forward.router", # llama4
|
||||||
|
"encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||||
|
@ -322,6 +323,7 @@ class TensorNameMap:
|
||||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||||
|
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
|
||||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||||
"model.layers.{bid}.residual_mlp.w3", # arctic
|
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||||
|
@ -337,6 +339,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
|
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
|
||||||
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
|
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
|
||||||
"language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
|
"language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
|
||||||
|
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||||
|
@ -418,6 +421,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
|
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
|
||||||
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
|
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
|
||||||
"language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
|
"language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
|
||||||
|
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||||
|
|
|
@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_REFACT, "refact" },
|
{ LLM_ARCH_REFACT, "refact" },
|
||||||
{ LLM_ARCH_BERT, "bert" },
|
{ LLM_ARCH_BERT, "bert" },
|
||||||
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||||
|
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
||||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||||
{ LLM_ARCH_BLOOM, "bloom" },
|
{ LLM_ARCH_BLOOM, "bloom" },
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
|
@ -106,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
||||||
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
||||||
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
||||||
|
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||||
|
@ -472,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_NOMIC_BERT_MOE,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||||
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_JINA_BERT_V2,
|
LLM_ARCH_JINA_BERT_V2,
|
||||||
{
|
{
|
||||||
|
|
|
@ -23,6 +23,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_REFACT,
|
LLM_ARCH_REFACT,
|
||||||
LLM_ARCH_BERT,
|
LLM_ARCH_BERT,
|
||||||
LLM_ARCH_NOMIC_BERT,
|
LLM_ARCH_NOMIC_BERT,
|
||||||
|
LLM_ARCH_NOMIC_BERT_MOE,
|
||||||
LLM_ARCH_JINA_BERT_V2,
|
LLM_ARCH_JINA_BERT_V2,
|
||||||
LLM_ARCH_BLOOM,
|
LLM_ARCH_BLOOM,
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
|
@ -110,6 +111,7 @@ enum llm_kv {
|
||||||
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
||||||
LLM_KV_EXPERT_WEIGHTS_NORM,
|
LLM_KV_EXPERT_WEIGHTS_NORM,
|
||||||
LLM_KV_EXPERT_GATING_FUNC,
|
LLM_KV_EXPERT_GATING_FUNC,
|
||||||
|
LLM_KV_MOE_EVERY_N_LAYERS,
|
||||||
LLM_KV_POOLING_TYPE,
|
LLM_KV_POOLING_TYPE,
|
||||||
LLM_KV_LOGIT_SCALE,
|
LLM_KV_LOGIT_SCALE,
|
||||||
LLM_KV_DECODER_START_TOKEN_ID,
|
LLM_KV_DECODER_START_TOKEN_ID,
|
||||||
|
|
|
@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||||
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
||||||
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
||||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
|
||||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
|
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
|
||||||
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
||||||
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
||||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||||
|
@ -123,7 +123,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
||||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||||
} else if (tmpl_contains("[gMASK]<sop>")) {
|
} else if (tmpl_contains("[gMASK]<sop>")) {
|
||||||
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
return LLM_CHAT_TEMPLATE_CHATGLM_4;
|
||||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||||
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
|
return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||||
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
|
} else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
|
||||||
|
@ -156,7 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||||
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
||||||
} else if (tmpl_contains("[gMASK]sop")) {
|
} else if (tmpl_contains("[gMASK]sop")) {
|
||||||
// chatglm3-6b
|
// chatglm3-6b
|
||||||
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
return LLM_CHAT_TEMPLATE_CHATGLM_3;
|
||||||
} else if (tmpl_contains(LU8("<用户>"))) {
|
} else if (tmpl_contains(LU8("<用户>"))) {
|
||||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||||
return LLM_CHAT_TEMPLATE_MINICPM;
|
return LLM_CHAT_TEMPLATE_MINICPM;
|
||||||
|
@ -437,7 +437,7 @@ int32_t llm_chat_apply_template(
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
||||||
}
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
|
||||||
// chatglm3-6b
|
// chatglm3-6b
|
||||||
ss << "[gMASK]" << "sop";
|
ss << "[gMASK]" << "sop";
|
||||||
for (auto message : chat) {
|
for (auto message : chat) {
|
||||||
|
@ -447,7 +447,7 @@ int32_t llm_chat_apply_template(
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<|assistant|>";
|
ss << "<|assistant|>";
|
||||||
}
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
||||||
ss << "[gMASK]" << "<sop>";
|
ss << "[gMASK]" << "<sop>";
|
||||||
for (auto message : chat) {
|
for (auto message : chat) {
|
||||||
std::string role(message->role);
|
std::string role(message->role);
|
||||||
|
@ -456,14 +456,6 @@ int32_t llm_chat_apply_template(
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<|assistant|>";
|
ss << "<|assistant|>";
|
||||||
}
|
}
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
|
|
||||||
for (auto message : chat) {
|
|
||||||
std::string role(message->role);
|
|
||||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
|
||||||
}
|
|
||||||
if (add_ass) {
|
|
||||||
ss << "<|assistant|>";
|
|
||||||
}
|
|
||||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
||||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||||
for (auto message : chat) {
|
for (auto message : chat) {
|
||||||
|
|
|
@ -29,8 +29,8 @@ enum llm_chat_template {
|
||||||
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
||||||
LLM_CHAT_TEMPLATE_COMMAND_R,
|
LLM_CHAT_TEMPLATE_COMMAND_R,
|
||||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||||
LLM_CHAT_TEMPLATE_CHATGML_3,
|
LLM_CHAT_TEMPLATE_CHATGLM_3,
|
||||||
LLM_CHAT_TEMPLATE_CHATGML_4,
|
LLM_CHAT_TEMPLATE_CHATGLM_4,
|
||||||
LLM_CHAT_TEMPLATE_GLMEDGE,
|
LLM_CHAT_TEMPLATE_GLMEDGE,
|
||||||
LLM_CHAT_TEMPLATE_MINICPM,
|
LLM_CHAT_TEMPLATE_MINICPM,
|
||||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||||
|
|
|
@ -1537,8 +1537,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||||
// set all ids as invalid (negative)
|
// set all ids as invalid (negative)
|
||||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||||
|
|
||||||
ggml_backend_buffer_clear(buf_output.get(), 0);
|
|
||||||
|
|
||||||
this->n_outputs = 0;
|
this->n_outputs = 0;
|
||||||
this->n_outputs_max = n_outputs_max;
|
this->n_outputs_max = n_outputs_max;
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
|
||||||
if (ubatch->pos && pos) {
|
if (ubatch->pos && pos) {
|
||||||
const int64_t n_tokens = ubatch->n_tokens;
|
const int64_t n_tokens = ubatch->n_tokens;
|
||||||
|
|
||||||
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
|
if (ubatch->token && n_pos_per_embd == 4) {
|
||||||
|
// in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
|
||||||
|
// the 3 first dims are the same, and 4th dim is all 0
|
||||||
|
std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
|
||||||
|
// copy the first dimension
|
||||||
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
|
pos_data[ i] = ubatch->pos[i];
|
||||||
|
pos_data[ n_tokens + i] = ubatch->pos[i];
|
||||||
|
pos_data[2 * n_tokens + i] = ubatch->pos[i];
|
||||||
|
pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
|
||||||
|
} else {
|
||||||
|
ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
||||||
) * f_attn_temp_scale + 1.0;
|
) * f_attn_temp_scale + 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
|
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||||
res (std::make_unique<llm_graph_result>()) {
|
res (std::make_unique<llm_graph_result>()) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t llm_graph_context::n_pos_per_token() const {
|
int64_t llm_graph_context::n_pos_per_embd() const {
|
||||||
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
|
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -914,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||||
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||||
cb(up, "ffn_moe_up", il);
|
cb(up, "ffn_moe_up", il);
|
||||||
|
|
||||||
ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
ggml_tensor * experts = nullptr;
|
||||||
cb(gate, "ffn_moe_gate", il);
|
if (gate_exps) {
|
||||||
|
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||||
|
cb(cur, "ffn_moe_gate", il);
|
||||||
|
} else {
|
||||||
|
cur = up;
|
||||||
|
}
|
||||||
|
|
||||||
switch (type_op) {
|
switch (type_op) {
|
||||||
case LLM_FFN_SILU:
|
case LLM_FFN_SILU:
|
||||||
{
|
{
|
||||||
gate = ggml_silu(ctx0, gate);
|
cur = ggml_silu(ctx0, cur);
|
||||||
cb(gate, "ffn_moe_silu", il);
|
cb(cur, "ffn_moe_silu", il);
|
||||||
} break;
|
} break;
|
||||||
case LLM_FFN_GELU:
|
case LLM_FFN_GELU:
|
||||||
{
|
{
|
||||||
gate = ggml_gelu(ctx0, gate);
|
cur = ggml_gelu(ctx0, cur);
|
||||||
cb(gate, "ffn_moe_gelu", il);
|
cb(cur, "ffn_moe_gelu", il);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
|
if (gate_exps) {
|
||||||
cb(par, "ffn_moe_gate_par", il);
|
cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
|
||||||
|
cb(cur, "ffn_moe_gate_par", il);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||||
cb(experts, "ffn_moe_down", il);
|
cb(experts, "ffn_moe_down", il);
|
||||||
|
|
||||||
if (!weight_before_ffn) {
|
if (!weight_before_ffn) {
|
||||||
|
@ -1018,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * llm_graph_context::build_inp_pos() const {
|
ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||||
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
|
auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
|
||||||
|
|
||||||
auto & cur = inp->pos;
|
auto & cur = inp->pos;
|
||||||
|
|
||||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
|
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
|
||||||
ggml_set_input(cur);
|
ggml_set_input(cur);
|
||||||
|
|
||||||
res->add_input(std::move(inp));
|
res->add_input(std::move(inp));
|
||||||
|
@ -1031,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||||
|
|
||||||
auto & cur = inp->attn_scale;
|
auto & cur = inp->attn_scale;
|
||||||
|
|
||||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
|
// this need to be 1x1xN for broadcasting
|
||||||
|
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
|
||||||
ggml_set_input(cur);
|
ggml_set_input(cur);
|
||||||
|
|
||||||
res->add_input(std::move(inp));
|
res->add_input(std::move(inp));
|
||||||
|
|
|
@ -90,29 +90,27 @@ public:
|
||||||
|
|
||||||
class llm_graph_input_pos : public llm_graph_input_i {
|
class llm_graph_input_pos : public llm_graph_input_i {
|
||||||
public:
|
public:
|
||||||
llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
|
llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
|
||||||
virtual ~llm_graph_input_pos() = default;
|
virtual ~llm_graph_input_pos() = default;
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
||||||
ggml_tensor * pos = nullptr; // I32 [n_batch]
|
ggml_tensor * pos = nullptr; // I32 [n_batch]
|
||||||
|
|
||||||
const int64_t n_pos_per_token = 1;
|
const int64_t n_pos_per_embd = 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
// temperature tuning, used by llama4
|
// temperature tuning, used by llama4
|
||||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||||
public:
|
public:
|
||||||
llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||||
: n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||||
virtual ~llm_graph_input_attn_temp() = default;
|
virtual ~llm_graph_input_attn_temp() = default;
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
||||||
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
|
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
|
||||||
|
|
||||||
const int64_t n_pos_per_token = 1;
|
|
||||||
|
|
||||||
const uint32_t n_attn_temp_floor_scale;
|
const uint32_t n_attn_temp_floor_scale;
|
||||||
const float f_attn_temp_scale;
|
const float f_attn_temp_scale;
|
||||||
};
|
};
|
||||||
|
@ -419,7 +417,7 @@ struct llm_graph_context {
|
||||||
|
|
||||||
llm_graph_context(const llm_graph_params & params);
|
llm_graph_context(const llm_graph_params & params);
|
||||||
|
|
||||||
int64_t n_pos_per_token() const;
|
int64_t n_pos_per_embd() const;
|
||||||
|
|
||||||
void cb(ggml_tensor * cur, const char * name, int il) const;
|
void cb(ggml_tensor * cur, const char * name, int il) const;
|
||||||
|
|
||||||
|
|
|
@ -66,6 +66,7 @@ struct llama_hparams {
|
||||||
float expert_weights_scale = 0.0;
|
float expert_weights_scale = 0.0;
|
||||||
bool expert_weights_norm = false;
|
bool expert_weights_norm = false;
|
||||||
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||||
|
uint32_t moe_every_n_layers = 0;
|
||||||
|
|
||||||
float f_norm_eps;
|
float f_norm_eps;
|
||||||
float f_norm_rms_eps;
|
float f_norm_rms_eps;
|
||||||
|
|
|
@ -97,6 +97,10 @@ const char * llm_type_name(llm_type type) {
|
||||||
case LLM_TYPE_290B: return "290B";
|
case LLM_TYPE_290B: return "290B";
|
||||||
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
||||||
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
||||||
|
case LLM_TYPE_0_6B: return "0.6B";
|
||||||
|
case LLM_TYPE_1_7B: return "1.7B";
|
||||||
|
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
||||||
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
||||||
default: return "?B";
|
default: return "?B";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -700,10 +704,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_NOMIC_BERT:
|
case LLM_ARCH_NOMIC_BERT:
|
||||||
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||||
|
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
||||||
|
|
||||||
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||||
type = LLM_TYPE_137M;
|
type = LLM_TYPE_137M;
|
||||||
|
@ -796,6 +802,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
+                   case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                   case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                   case 40: type = LLM_TYPE_14B; break;
+                   case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;

@ -805,6 +815,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
+                   case 48: type = LLM_TYPE_30B_A3B; break;
+                   case 94: type = LLM_TYPE_235B_A22B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;

@ -2152,6 +2164,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_BERT:
            case LLM_ARCH_NOMIC_BERT:
+           case LLM_ARCH_NOMIC_BERT_MOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);

@ -2185,20 +2198,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
                    }

+                   if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                       layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+                   }
+
                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                    layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

-                   layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                   layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
-                   if (arch == LLM_ARCH_BERT) {
+                   if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-                       layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
-                       layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                       layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+                       layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                       layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    } else {
-                       layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                       layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                       layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                       if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                           layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                           layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+                           layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+                       } else {
+                           layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                       }
                    }

                    layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);

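For reference only (not part of the diff): a minimal standalone sketch of the moe_every_n_layers interleaving rule used in the hunk above, where a layer i gets the MoE tensors when the stride is non-zero and i % stride == 1; the layer count and stride below are hypothetical example values.

// Standalone illustration of the moe_every_n_layers interleaving rule.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer            = 12; // hypothetical layer count
    const uint32_t moe_every_n_layers = 2;  // hypothetical stride (every 2nd layer is MoE)

    for (uint32_t i = 0; i < n_layer; ++i) {
        // same condition as in load_tensors / llm_build_bert above
        const bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
        std::printf("layer %2u -> %s\n", i, is_moe ? "MoE FFN" : "dense FFN");
    }
    return 0;
}
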
@ -5833,6 +5857,11 @@ struct llm_build_bert : public llm_graph_context {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

+               if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                   cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                   cb(cur, "bqkv", il);
+               }
+
                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

@ -5885,13 +5914,29 @@ struct llm_build_bert : public llm_graph_context {
        cb(ffn_inp, "ffn_inp", il);

        // feed-forward network
-       if (model.arch == LLM_ARCH_BERT) {
+       if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+           // MoE branch
+           cur = build_moe_ffn(cur,
+                   model.layers[il].ffn_gate_inp,
+                   model.layers[il].ffn_up_exps,
+                   nullptr,
+                   model.layers[il].ffn_down_exps,
+                   nullptr,
+                   hparams.n_expert,
+                   hparams.n_expert_used,
+                   LLM_FFN_GELU,
+                   false, false,
+                   0.0f,
+                   LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+           cb(cur, "ffn_moe_out", il);
+       } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                    NULL, NULL, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
+           cb(cur, "ffn_out", il);
        } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up, NULL, NULL,

@ -5899,6 +5944,7 @@ struct llm_build_bert : public llm_graph_context {
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_PAR, il);
+           cb(cur, "ffn_out", il);
        } else {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up, NULL, NULL,

@ -5906,8 +5952,8 @@ struct llm_build_bert : public llm_graph_context {
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+           cb(cur, "ffn_out", il);
        }
-       cb(cur, "ffn_out", il);

        // attentions bypass the intermediate layer
        cur = ggml_add(ctx0, cur, ffn_inp);

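For reference only (not part of the diff): a plain-C++ sketch of what softmax gating with n_expert_used experts means conceptually for the MoE branch above. The real path goes through build_moe_ffn on ggml tensors; the expert count, top-k, and router logits below are hypothetical.

// Conceptual sketch of softmax gating + top-k expert selection
// (LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX with n_expert_used experts).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert      = 8; // hypothetical expert count
    const int n_expert_used = 2; // hypothetical top-k

    // router logits for one token (hypothetical values)
    std::vector<float> logits = {0.1f, 1.2f, -0.3f, 0.9f, 0.0f, 2.1f, -1.0f, 0.4f};

    // softmax over all experts
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) {
        probs[e] = std::exp(logits[e] - max_logit);
        sum += probs[e];
    }
    for (float & p : probs) {
        p /= sum;
    }

    // keep the n_expert_used highest-probability experts; their probabilities
    // become the mixing weights for the selected experts' FFN outputs
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("expert %d weight %.3f\n", idx[k], probs[idx[k]]);
    }
    return 0;
}
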
@ -10252,6 +10298,7 @@ struct llm_build_deepseek2 : public llm_graph_context {

                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+               ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
                cb(q_nope_absorbed, "q_nope_absorbed", il);

                // {kv_lora_rank, n_head, n_tokens}

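For reference only (not part of the diff): a minimal sketch, assuming the public ggml API, of how ggml_mul_mat_set_prec is applied to a single mul_mat node to request F32 accumulation; the shapes and buffer size below are hypothetical.

// Minimal ggml graph-construction sketch for ggml_mul_mat_set_prec.
#include "ggml.h"

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);

    // hypothetical shapes: a is 64x128, b is 64x32 -> result is 128x32
    ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 128);
    ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64,  32);

    ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    // ask backends to accumulate this particular matmul in F32,
    // as the deepseek2 hunk above does for q_nope_absorbed
    ggml_mul_mat_set_prec(c, GGML_PREC_F32);

    ggml_free(ctx);
    return 0;
}
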
@ -12945,6 +12992,7 @@ llm_graph_result_ptr llama_model::build_graph(
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_NOMIC_BERT:
+       case LLM_ARCH_NOMIC_BERT_MOE:
            {
                llm = std::make_unique<llm_build_bert>(*this, params, gf);
            } break;

@ -13303,6 +13351,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
+       case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:

@ -88,6 +88,10 @@ enum llm_type {
    LLM_TYPE_290B,
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
+   LLM_TYPE_0_6B,
+   LLM_TYPE_1_7B,
+   LLM_TYPE_30B_A3B,
+   LLM_TYPE_235B_A22B,
};

struct llama_layer_posnet {