mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-23 04:19:08 +00:00
mmproj autofit reworked
This commit is contained in:
parent
de6b8f9369
commit
694e8824c5
2 changed files with 77 additions and 26 deletions
|
|
@ -2216,6 +2216,46 @@ void kcpp_init_audio_proj(clip_ctx * ctx_a)
|
|||
audio_preproc->initialize();
|
||||
}
|
||||
|
||||
clip_context_params init_clip_ctx_params(bool mmproj_cpu, bool dryrun)
|
||||
{
|
||||
#if defined(GGML_USE_METAL)
|
||||
if(file_format_meta.model_architecture == llm_arch::LLM_ARCH_QWEN2VL || file_format_meta.model_architecture == llm_arch::LLM_ARCH_GEMMA3)
|
||||
{
|
||||
mmproj_cpu = true;
|
||||
if(!dryrun)
|
||||
{
|
||||
set_clip_uses_gpu(false);
|
||||
printf("Clip will use CPU for this model!\n");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
clip_flash_attn_type clip_fa = (kcpp_data->flash_attn?CLIP_FLASH_ATTN_TYPE_ENABLED:CLIP_FLASH_ATTN_TYPE_DISABLED); //kcpp: disabled in 1.102.2 as some headsizes break on turing
|
||||
#if defined(GGML_USE_CUDA)
|
||||
clip_fa = CLIP_FLASH_ATTN_TYPE_DISABLED; //kcpp: disabled in 1.102.2 as some headsizes break on turing
|
||||
#endif
|
||||
if(mmproj_cpu)
|
||||
{
|
||||
if(!dryrun)
|
||||
{
|
||||
set_clip_uses_gpu(false);
|
||||
printf("Clip forced to use CPU!\n");
|
||||
}
|
||||
clip_fa = (kcpp_data->flash_attn?CLIP_FLASH_ATTN_TYPE_ENABLED:CLIP_FLASH_ATTN_TYPE_DISABLED); //however if using CPU, fa is fine
|
||||
}
|
||||
clip_context_params ctx_clip_params {
|
||||
/* use_gpu */ true,
|
||||
/* flash_attn_type */ clip_fa,
|
||||
/* image_min_tokens */ kcpp_data->vision_min_tokens,
|
||||
/* image_max_tokens */ kcpp_data->vision_max_tokens,
|
||||
/* warmup */ false,
|
||||
/* cb_eval */ nullptr,
|
||||
/* cb_eval_user_data */ nullptr,
|
||||
/* no_alloc */ dryrun?true:false,
|
||||
};
|
||||
|
||||
return ctx_clip_params;
|
||||
}
|
||||
|
||||
ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
|
||||
{
|
||||
is_quiet = inputs.quiet;
|
||||
|
|
@ -2659,8 +2699,41 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
rocblas_initialize();
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
size_t totalmmprojtax = 0;
|
||||
if(mmproj_filename != "" && file_format==FileFormat::GGUF_GENERIC && !inputs.mmproj_cpu)
|
||||
{
|
||||
printf("\nEstimating MMProj GPU usage...");
|
||||
auto saved_log_callback = g_logger_state.log_callback;
|
||||
auto saved_log_user_data = g_logger_state.log_callback_user_data;
|
||||
g_logger_state.log_callback = log_callback_off;
|
||||
g_logger_state.log_callback_user_data = nullptr;
|
||||
clip_context_params ctx_clip_params = init_clip_ctx_params(inputs.mmproj_cpu,true);
|
||||
clip_init_result cres = clip_init(mmproj_filename.c_str(), ctx_clip_params);
|
||||
g_logger_state.log_callback = saved_log_callback;
|
||||
g_logger_state.log_callback_user_data = saved_log_user_data;
|
||||
clip_ctx * test1 = cres.ctx_v;
|
||||
clip_ctx * test2 = cres.ctx_a;
|
||||
|
||||
if(test1)
|
||||
{
|
||||
for (auto & [dev, size] : clip_get_mem_usage(test1)) {
|
||||
totalmmprojtax += size;
|
||||
}
|
||||
clip_free(test1);
|
||||
}
|
||||
if(test2)
|
||||
{
|
||||
for (auto & [dev, size] : clip_get_mem_usage(test2)) {
|
||||
totalmmprojtax += size;
|
||||
}
|
||||
clip_free(test2);
|
||||
}
|
||||
totalmmprojtax = totalmmprojtax / (1024*1024);
|
||||
printf("MMProj Autofit Usage: %zu MB", totalmmprojtax);
|
||||
}
|
||||
|
||||
common_params temp_params;
|
||||
size_t taxmb = inputs.autofit_tax_mb;
|
||||
size_t taxmb = inputs.autofit_tax_mb + totalmmprojtax;
|
||||
printf("\nAttempting to use llama.cpp's automating fitting code. This will override all your layer configs, may or may not work!\n");
|
||||
//zero out any customizations made
|
||||
tenos.clear();
|
||||
|
|
@ -2821,29 +2894,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
if(mmproj_filename != "" && file_format==FileFormat::GGUF_GENERIC)
|
||||
{
|
||||
printf("\nAttempting to apply Multimodal Projector: %s\n", mmproj_filename.c_str());
|
||||
#if defined(GGML_USE_METAL)
|
||||
if(file_format_meta.model_architecture == llm_arch::LLM_ARCH_QWEN2VL || file_format_meta.model_architecture == llm_arch::LLM_ARCH_GEMMA3)
|
||||
{
|
||||
set_clip_uses_gpu(false);
|
||||
printf("Clip will use CPU for this model!\n");
|
||||
}
|
||||
#endif
|
||||
clip_flash_attn_type clip_fa = (kcpp_data->flash_attn?CLIP_FLASH_ATTN_TYPE_ENABLED:CLIP_FLASH_ATTN_TYPE_DISABLED); //kcpp: disabled in 1.102.2 as some headsizes break on turing
|
||||
#if defined(GGML_USE_CUDA)
|
||||
clip_fa = CLIP_FLASH_ATTN_TYPE_DISABLED; //kcpp: disabled in 1.102.2 as some headsizes break on turing
|
||||
#endif
|
||||
if(inputs.mmproj_cpu)
|
||||
{
|
||||
set_clip_uses_gpu(false);
|
||||
printf("Clip forced to use CPU!\n");
|
||||
clip_fa = (kcpp_data->flash_attn?CLIP_FLASH_ATTN_TYPE_ENABLED:CLIP_FLASH_ATTN_TYPE_DISABLED); //however if using CPU, fa is fine
|
||||
}
|
||||
clip_context_params ctx_clip_params {
|
||||
/* use_gpu */ true,
|
||||
/* flash_attn_type */ clip_fa,
|
||||
/* image_min_tokens */ kcpp_data->vision_min_tokens,
|
||||
/* image_max_tokens */ kcpp_data->vision_max_tokens,
|
||||
};
|
||||
clip_context_params ctx_clip_params = init_clip_ctx_params(inputs.mmproj_cpu,false);
|
||||
clip_init_result cres = clip_init(mmproj_filename.c_str(), ctx_clip_params);
|
||||
clp_ctx_v = cres.ctx_v;
|
||||
clp_ctx_a = cres.ctx_a;
|
||||
|
|
|
|||
|
|
@ -1679,8 +1679,8 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, musiclowvram): #shitty algo to d
|
|||
calulated_gpu_overhead += 1024*1024*1024*(4.25 - sdquanted * 0.5) # 4.25, 3.75, 3.25
|
||||
if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
|
||||
calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
|
||||
if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
|
||||
calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[5]*1.5)
|
||||
# if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax (now internal to kcpp)
|
||||
# calulated_gpu_overhead += max(350*1024*1024,modelfile_extracted_meta[5]*1.5)
|
||||
if modelfile_extracted_meta[6] > 1024*1024*10: #draft model tax
|
||||
calulated_gpu_overhead += (modelfile_extracted_meta[6] * 1.5)
|
||||
if modelfile_extracted_meta[7] > 1024*1024*10: #tts model tax
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue