mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
mtmd : Fix MinicpmV model converter and clip to avoid using hardcode. (#14750)
* Fix MinicpmV model converter and clip to avoid using hard-coded values.
* Code update for pr/14750
* Remove unused field, update script path in docs.
* Add version 5 for fallback code.
---------
Co-authored-by: lzhang <zhanglei@modelbest.cn>
This commit is contained in:
parent
fba5c0d680
commit
cf9e5648a7
6 changed files with 116 additions and 78 deletions
|
@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||||
|
|
||||||
Clone llama.cpp:
|
Clone llama.cpp:
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
git clone https://github.com/ggml-org/llama.cpp
|
||||||
cd llama.cpp
|
cd llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||||
|
|
||||||
Clone llama.cpp:
|
Clone llama.cpp:
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
git clone https://github.com/ggml-org/llama.cpp
|
||||||
cd llama.cpp
|
cd llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -44,6 +44,7 @@
|
||||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||||
|
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||||
|
|
||||||
// audio-specific
|
// audio-specific
|
||||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||||
|
|
|
@ -201,6 +201,7 @@ struct clip_hparams {
|
||||||
// legacy
|
// legacy
|
||||||
bool has_llava_projector = false;
|
bool has_llava_projector = false;
|
||||||
int minicpmv_version = 0;
|
int minicpmv_version = 0;
|
||||||
|
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_layer {
|
struct clip_layer {
|
||||||
|
@ -866,21 +867,8 @@ struct clip_graph {
|
||||||
int n_embd = clip_n_mmproj_embd(ctx);
|
int n_embd = clip_n_mmproj_embd(ctx);
|
||||||
const int d_head = 128;
|
const int d_head = 128;
|
||||||
int n_head = n_embd/d_head;
|
int n_head = n_embd/d_head;
|
||||||
int num_query = 96;
|
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||||
if (ctx->model.hparams.minicpmv_version == 2) {
|
int num_query = ctx->model.hparams.minicpmv_query_num;
|
||||||
// MiniCPM-V 2.5
|
|
||||||
num_query = 96;
|
|
||||||
} else if (ctx->model.hparams.minicpmv_version == 3) {
|
|
||||||
// MiniCPM-V 2.6
|
|
||||||
num_query = 64;
|
|
||||||
} else if (ctx->model.hparams.minicpmv_version == 4) {
|
|
||||||
// MiniCPM-o 2.6
|
|
||||||
num_query = 64;
|
|
||||||
} else if (ctx->model.hparams.minicpmv_version == 5) {
|
|
||||||
// MiniCPM-V 4.0
|
|
||||||
num_query = 64;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor * Q = ggml_add(ctx0,
|
ggml_tensor * Q = ggml_add(ctx0,
|
||||||
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
||||||
model.mm_model_attn_q_b);
|
model.mm_model_attn_q_b);
|
||||||
|
@ -2138,7 +2126,19 @@ struct clip_model_loader {
|
||||||
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
||||||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
||||||
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
||||||
|
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
|
||||||
|
if (hparams.minicpmv_query_num == 0) {
|
||||||
|
// Fallback to hardcoded values for legacy models
|
||||||
|
if (hparams.minicpmv_version == 3) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else if (hparams.minicpmv_version == 4) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else if (hparams.minicpmv_version == 5) {
|
||||||
|
hparams.minicpmv_query_num = 64;
|
||||||
|
} else {
|
||||||
|
hparams.minicpmv_query_num = 96;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (is_audio) {
|
} else if (is_audio) {
|
||||||
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
||||||
|
|
||||||
|
@ -3556,14 +3556,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_MINICPMV:
|
case PROJECTOR_TYPE_MINICPMV:
|
||||||
{
|
{
|
||||||
|
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||||
|
if (params.minicpmv_query_num > 0) {
|
||||||
|
n_patches_sq = params.minicpmv_query_num;
|
||||||
|
} else {
|
||||||
|
// Fallback to hardcoded values for legacy models
|
||||||
if (params.minicpmv_version == 2) {
|
if (params.minicpmv_version == 2) {
|
||||||
// MiniCPM-V 2.5
|
|
||||||
n_patches_sq = 96;
|
n_patches_sq = 96;
|
||||||
} else if (params.minicpmv_version == 3) {
|
} else if (params.minicpmv_version == 3) {
|
||||||
// MiniCPM-V 2.6
|
|
||||||
n_patches_sq = 64;
|
n_patches_sq = 64;
|
||||||
} else if (params.minicpmv_version == 4) {
|
} else if (params.minicpmv_version == 4) {
|
||||||
// MiniCPM-o 2.6
|
|
||||||
n_patches_sq = 64;
|
n_patches_sq = 64;
|
||||||
} else if (params.minicpmv_version == 5) {
|
} else if (params.minicpmv_version == 5) {
|
||||||
// MiniCPM-V 4.0
|
// MiniCPM-V 4.0
|
||||||
|
@ -3571,6 +3573,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("Unknown minicpmv version");
|
GGML_ABORT("Unknown minicpmv version");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
case PROJECTOR_TYPE_QWEN25VL:
|
case PROJECTOR_TYPE_QWEN25VL:
|
||||||
|
@ -4102,7 +4105,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
}
|
}
|
||||||
|
|
||||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
const auto & hparams = ctx->model.hparams;
|
|
||||||
switch (ctx->model.proj_type) {
|
switch (ctx->model.proj_type) {
|
||||||
case PROJECTOR_TYPE_LDP:
|
case PROJECTOR_TYPE_LDP:
|
||||||
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
||||||
|
@ -4114,20 +4116,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||||
case PROJECTOR_TYPE_MLP_NORM:
|
case PROJECTOR_TYPE_MLP_NORM:
|
||||||
return ctx->model.mm_3_b->ne[0];
|
return ctx->model.mm_3_b->ne[0];
|
||||||
case PROJECTOR_TYPE_MINICPMV:
|
case PROJECTOR_TYPE_MINICPMV:
|
||||||
if (hparams.minicpmv_version == 2) {
|
return ctx->model.mm_model_proj->ne[0];
|
||||||
// MiniCPM-V 2.5
|
|
||||||
return 4096;
|
|
||||||
} else if (hparams.minicpmv_version == 3) {
|
|
||||||
// MiniCPM-V 2.6
|
|
||||||
return 3584;
|
|
||||||
} else if (hparams.minicpmv_version == 4) {
|
|
||||||
// MiniCPM-o 2.6
|
|
||||||
return 3584;
|
|
||||||
} else if (hparams.minicpmv_version == 5) {
|
|
||||||
// MiniCPM-V 4.0
|
|
||||||
return 2560;
|
|
||||||
}
|
|
||||||
GGML_ABORT("Unknown minicpmv version");
|
|
||||||
case PROJECTOR_TYPE_GLM_EDGE:
|
case PROJECTOR_TYPE_GLM_EDGE:
|
||||||
return ctx->model.mm_model_mlp_3_w->ne[1];
|
return ctx->model.mm_model_mlp_3_w->ne[1];
|
||||||
case PROJECTOR_TYPE_QWEN2VL:
|
case PROJECTOR_TYPE_QWEN2VL:
|
||||||
|
|
|
@ -517,6 +517,16 @@ if args.use_f32:
|
||||||
# output in the same directory as the model if output_dir is None
|
# output in the same directory as the model if output_dir is None
|
||||||
dir_model = args.model_dir
|
dir_model = args.model_dir
|
||||||
|
|
||||||
|
# Read config.json to get actual model configuration
|
||||||
|
config_path = os.path.join(dir_model, "config.json")
|
||||||
|
model_config = {}
|
||||||
|
if os.path.isfile(config_path):
|
||||||
|
with open(config_path, "r", encoding="utf-8") as f:
|
||||||
|
model_config = json.load(f)
|
||||||
|
print(f"Loaded config from {config_path}")
|
||||||
|
else:
|
||||||
|
print(f"Warning: config.json not found at {config_path}")
|
||||||
|
|
||||||
# If minicpmv_projector is not specified but the default path exists, use the default path
|
# If minicpmv_projector is not specified but the default path exists, use the default path
|
||||||
if args.minicpmv_projector is None:
|
if args.minicpmv_projector is None:
|
||||||
default_projector_path = os.path.join(dir_model, "minicpmv.projector")
|
default_projector_path = os.path.join(dir_model, "minicpmv.projector")
|
||||||
|
@ -555,25 +565,50 @@ if args.use_f32:
|
||||||
# processor = CLIPProcessor.from_pretrained(dir_model)
|
# processor = CLIPProcessor.from_pretrained(dir_model)
|
||||||
|
|
||||||
minicpmv_version = args.minicpmv_version
|
minicpmv_version = args.minicpmv_version
|
||||||
emb_dim = 4096
|
|
||||||
block_count = 26
|
# Use actual config values instead of hardcoded ones
|
||||||
if minicpmv_version == 1: # MiniCPM-V 2.0
|
if model_config:
|
||||||
|
# For the projector/resampler, use the main model's hidden_size
|
||||||
|
emb_dim = model_config.get("hidden_size", 1536)
|
||||||
|
|
||||||
|
# For the vision model, use vision_config values
|
||||||
|
vision_config_dict = model_config.get("vision_config", {})
|
||||||
|
default_vision_config = {
|
||||||
|
"hidden_size": vision_config_dict.get("hidden_size", 1152),
|
||||||
|
"image_size": vision_config_dict.get("image_size", 980),
|
||||||
|
"intermediate_size": vision_config_dict.get("intermediate_size", 4304),
|
||||||
|
"model_type": vision_config_dict.get("model_type", "siglip"),
|
||||||
|
"num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
|
||||||
|
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
|
||||||
|
"patch_size": vision_config_dict.get("patch_size", 14),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Use vision model's num_hidden_layers for block_count
|
||||||
|
block_count = vision_config_dict.get("num_hidden_layers", 27)
|
||||||
|
|
||||||
|
print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
|
||||||
|
print(f"Vision config: {default_vision_config}")
|
||||||
|
else:
|
||||||
|
# Fallback to original hardcoded logic if config.json not found
|
||||||
|
emb_dim = 4096
|
||||||
|
block_count = 26
|
||||||
|
if minicpmv_version == 1:
|
||||||
emb_dim = 2304
|
emb_dim = 2304
|
||||||
block_count = 26
|
block_count = 26
|
||||||
elif minicpmv_version == 2: # MiniCPM-V 2.5
|
elif minicpmv_version == 2:
|
||||||
emb_dim = 4096
|
emb_dim = 4096
|
||||||
block_count = 27
|
block_count = 27
|
||||||
elif minicpmv_version == 3: # MiniCPM-V 2.6
|
elif minicpmv_version == 3:
|
||||||
emb_dim = 3584
|
emb_dim = 3584
|
||||||
block_count = 27
|
block_count = 27
|
||||||
elif minicpmv_version == 4: # MiniCPM-o 2.6
|
elif minicpmv_version == 4:
|
||||||
emb_dim = 3584
|
emb_dim = 3584
|
||||||
block_count = 27
|
block_count = 27
|
||||||
elif minicpmv_version == 5: # MiniCPM-V 4.0
|
elif minicpmv_version == 5:
|
||||||
emb_dim = 2560
|
emb_dim = 2560
|
||||||
block_count = 27
|
block_count = 27
|
||||||
|
|
||||||
default_vision_config = {
|
default_vision_config = {
|
||||||
"hidden_size": 1152,
|
"hidden_size": 1152,
|
||||||
"image_size": 980,
|
"image_size": 980,
|
||||||
"intermediate_size": 4304,
|
"intermediate_size": 4304,
|
||||||
|
@ -585,7 +620,7 @@ default_vision_config = {
|
||||||
|
|
||||||
vision_config = Idefics2VisionConfig(**default_vision_config)
|
vision_config = Idefics2VisionConfig(**default_vision_config)
|
||||||
model = Idefics2VisionTransformer(vision_config)
|
model = Idefics2VisionTransformer(vision_config)
|
||||||
if minicpmv_version == 3:
|
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
|
||||||
vision_config = SiglipVisionConfig(**default_vision_config)
|
vision_config = SiglipVisionConfig(**default_vision_config)
|
||||||
model = SiglipVisionTransformer(vision_config)
|
model = SiglipVisionTransformer(vision_config)
|
||||||
elif minicpmv_version == 4:
|
elif minicpmv_version == 4:
|
||||||
|
@ -644,16 +679,27 @@ else:
|
||||||
fout.add_description("two-tower CLIP model")
|
fout.add_description("two-tower CLIP model")
|
||||||
|
|
||||||
if has_vision_encoder:
|
if has_vision_encoder:
|
||||||
# vision_model hparams
|
# vision_model hparams - use actual config values
|
||||||
fout.add_uint32("clip.vision.image_size", 448)
|
vision_image_size = model_config.get("image_size", 448) if model_config else 448
|
||||||
fout.add_uint32("clip.vision.patch_size", 14)
|
vision_patch_size = default_vision_config.get("patch_size", 14)
|
||||||
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
|
vision_hidden_size = default_vision_config.get("hidden_size", 1152)
|
||||||
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
|
vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
|
||||||
|
vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
|
||||||
|
|
||||||
|
fout.add_uint32("clip.vision.image_size", vision_image_size)
|
||||||
|
fout.add_uint32("clip.vision.patch_size", vision_patch_size)
|
||||||
|
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
|
||||||
|
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
|
||||||
fout.add_uint32("clip.vision.projection_dim", 0)
|
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||||
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
|
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
|
||||||
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||||
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
|
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
|
||||||
|
|
||||||
|
# Add MiniCPM-V specific parameters
|
||||||
|
query_num = model_config.get("query_num", 0) if model_config else 0
|
||||||
|
resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
|
||||||
|
fout.add_uint32("clip.minicpmv_query_num", query_num)
|
||||||
|
|
||||||
if processor is not None:
|
if processor is not None:
|
||||||
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
||||||
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
||||||
|
|
|
@ -16,6 +16,8 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
|
||||||
|
|
||||||
# store these tensors in a new dictionary and torch.save them
|
# store these tensors in a new dictionary and torch.save them
|
||||||
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
||||||
|
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
|
||||||
|
projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
|
||||||
torch.save(projector, f"{args.model}/minicpmv.projector")
|
torch.save(projector, f"{args.model}/minicpmv.projector")
|
||||||
|
|
||||||
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
|
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue