mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
vision can set max res (+1 squashed commits)
Squashed commits: [938fc655] vision can set max res
This commit is contained in:
parent
2f69432774
commit
70f1d8d746
6 changed files with 25 additions and 12 deletions
|
@ -1744,7 +1744,7 @@ uint8_t* scale_down_image(uint8_t* input_image, int& nx, int& ny, int nc, int ma
|
||||||
return resized_image;
|
return resized_image;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img, const int maxdims) {
|
||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
|
@ -1753,7 +1753,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
||||||
}
|
}
|
||||||
|
|
||||||
float maxaspect = 4.0f;
|
float maxaspect = 4.0f;
|
||||||
int maxdims = 2048;
|
|
||||||
|
|
||||||
//check if image needs downscaling
|
//check if image needs downscaling
|
||||||
if (nx > maxdims || ny > maxdims) {
|
if (nx > maxdims || ny > maxdims) {
|
||||||
|
|
|
@ -76,7 +76,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||||
|
|
||||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img, const int maxdims);
|
||||||
|
|
||||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||||
|
|
|
@ -473,7 +473,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||||
|
|
||||||
struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
|
struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
|
||||||
clip_image_u8 * img = clip_image_u8_init();
|
clip_image_u8 * img = clip_image_u8_init();
|
||||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img, 2048)) {
|
||||||
clip_image_u8_free(img);
|
clip_image_u8_free(img);
|
||||||
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
|
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
1
expose.h
1
expose.h
|
@ -45,6 +45,7 @@ struct load_model_inputs
|
||||||
const int draft_gpulayers = 999;
|
const int draft_gpulayers = 999;
|
||||||
const float draft_gpusplit[tensor_split_max] = {};
|
const float draft_gpusplit[tensor_split_max] = {};
|
||||||
const char * mmproj_filename = nullptr;
|
const char * mmproj_filename = nullptr;
|
||||||
|
const int visionmaxres = 2048;
|
||||||
const bool use_mmap = false;
|
const bool use_mmap = false;
|
||||||
const bool use_mlock = false;
|
const bool use_mlock = false;
|
||||||
const bool use_smartcontext = false;
|
const bool use_smartcontext = false;
|
||||||
|
|
|
@ -101,6 +101,7 @@ static clip_image_u8 * clp_img_data = nullptr; //most recent image
|
||||||
static std::vector<llava_image> llava_images;
|
static std::vector<llava_image> llava_images;
|
||||||
static std::string llava_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
|
static std::string llava_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
|
||||||
static int current_llava_identifier = LLAVA_TOKEN_IDENTIFIER_A;
|
static int current_llava_identifier = LLAVA_TOKEN_IDENTIFIER_A;
|
||||||
|
static int vision_max_res = 2048;
|
||||||
|
|
||||||
static kcpp_params * kcpp_data = nullptr;
|
static kcpp_params * kcpp_data = nullptr;
|
||||||
static int max_context_limit_at_load = 0;
|
static int max_context_limit_at_load = 0;
|
||||||
|
@ -1909,6 +1910,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
= gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
|
= gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
|
||||||
= mpt_ctx_v3.hparams.n_ctx = kcpp_data->n_ctx;
|
= mpt_ctx_v3.hparams.n_ctx = kcpp_data->n_ctx;
|
||||||
|
|
||||||
|
vision_max_res = inputs.visionmaxres;
|
||||||
|
|
||||||
//determine rope scaling params
|
//determine rope scaling params
|
||||||
float rope_freq_scale = 1.0f;
|
float rope_freq_scale = 1.0f;
|
||||||
float rope_freq_base = 10000.0f;
|
float rope_freq_base = 10000.0f;
|
||||||
|
@ -3072,7 +3075,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
{
|
{
|
||||||
std::string llava_image = llava_images[i].b64data;
|
std::string llava_image = llava_images[i].b64data;
|
||||||
const std::vector<uint8_t> image_buffer = kcpp_base64_decode(llava_image);
|
const std::vector<uint8_t> image_buffer = kcpp_base64_decode(llava_image);
|
||||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), clp_img_data))
|
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), clp_img_data, vision_max_res))
|
||||||
{
|
{
|
||||||
//failed to load image
|
//failed to load image
|
||||||
printf("\nError: Clip image %d failed to load!",i);
|
printf("\nError: Clip image %d failed to load!",i);
|
||||||
|
|
24
koboldcpp.py
24
koboldcpp.py
|
@ -36,6 +36,8 @@ bias_min_value = -100.0
|
||||||
bias_max_value = 100.0
|
bias_max_value = 100.0
|
||||||
logprobs_max = 5
|
logprobs_max = 5
|
||||||
default_draft_amount = 8
|
default_draft_amount = 8
|
||||||
|
default_ttsmaxlen = 4096
|
||||||
|
default_visionmaxres = 1024
|
||||||
|
|
||||||
# abuse prevention
|
# abuse prevention
|
||||||
stop_token_max = 256
|
stop_token_max = 256
|
||||||
|
@ -151,6 +153,7 @@ class load_model_inputs(ctypes.Structure):
|
||||||
("draft_gpulayers", ctypes.c_int),
|
("draft_gpulayers", ctypes.c_int),
|
||||||
("draft_gpusplit", ctypes.c_float * tensor_split_max),
|
("draft_gpusplit", ctypes.c_float * tensor_split_max),
|
||||||
("mmproj_filename", ctypes.c_char_p),
|
("mmproj_filename", ctypes.c_char_p),
|
||||||
|
("visionmaxres", ctypes.c_int),
|
||||||
("use_mmap", ctypes.c_bool),
|
("use_mmap", ctypes.c_bool),
|
||||||
("use_mlock", ctypes.c_bool),
|
("use_mlock", ctypes.c_bool),
|
||||||
("use_smartcontext", ctypes.c_bool),
|
("use_smartcontext", ctypes.c_bool),
|
||||||
|
@ -1057,6 +1060,7 @@ def load_model(model_filename):
|
||||||
else:
|
else:
|
||||||
inputs.draft_gpusplit[n] = 0
|
inputs.draft_gpusplit[n] = 0
|
||||||
inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
|
inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
|
||||||
|
inputs.visionmaxres = (512 if args.visionmaxres < 512 else (2048 if args.visionmaxres > 2048 else args.visionmaxres))
|
||||||
inputs.use_smartcontext = args.smartcontext
|
inputs.use_smartcontext = args.smartcontext
|
||||||
inputs.use_contextshift = (0 if args.noshift else 1)
|
inputs.use_contextshift = (0 if args.noshift else 1)
|
||||||
inputs.use_fastforward = (0 if args.nofastforward else 1)
|
inputs.use_fastforward = (0 if args.nofastforward else 1)
|
||||||
|
@ -3242,6 +3246,7 @@ def show_gui():
|
||||||
lora_base_var = ctk.StringVar()
|
lora_base_var = ctk.StringVar()
|
||||||
preloadstory_var = ctk.StringVar()
|
preloadstory_var = ctk.StringVar()
|
||||||
mmproj_var = ctk.StringVar()
|
mmproj_var = ctk.StringVar()
|
||||||
|
visionmaxres_var = ctk.StringVar(value=str(default_visionmaxres))
|
||||||
draftmodel_var = ctk.StringVar()
|
draftmodel_var = ctk.StringVar()
|
||||||
draftamount_var = ctk.StringVar(value=str(default_draft_amount))
|
draftamount_var = ctk.StringVar(value=str(default_draft_amount))
|
||||||
draftgpulayers_var = ctk.StringVar(value=str(999))
|
draftgpulayers_var = ctk.StringVar(value=str(999))
|
||||||
|
@ -3281,7 +3286,7 @@ def show_gui():
|
||||||
wavtokenizer_var = ctk.StringVar()
|
wavtokenizer_var = ctk.StringVar()
|
||||||
ttsgpu_var = ctk.IntVar(value=0)
|
ttsgpu_var = ctk.IntVar(value=0)
|
||||||
tts_threads_var = ctk.StringVar(value=str(default_threads))
|
tts_threads_var = ctk.StringVar(value=str(default_threads))
|
||||||
ttsmaxlen_var = ctk.StringVar(value=str(4096))
|
ttsmaxlen_var = ctk.StringVar(value=str(default_ttsmaxlen))
|
||||||
|
|
||||||
def tabbuttonaction(name):
|
def tabbuttonaction(name):
|
||||||
for t in tabcontent:
|
for t in tabcontent:
|
||||||
|
@ -3739,10 +3744,11 @@ def show_gui():
|
||||||
makefileentry(model_tab, "Text Lora:", "Select Lora File",lora_var, 3,width=280,singlerow=True,tooltiptxt="Select an optional GGML Text LoRA adapter to use.\nLeave blank to skip.")
|
makefileentry(model_tab, "Text Lora:", "Select Lora File",lora_var, 3,width=280,singlerow=True,tooltiptxt="Select an optional GGML Text LoRA adapter to use.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5,width=280,singlerow=True,tooltiptxt="Select an optional F16 GGML Text LoRA base file to use.\nLeave blank to skip.")
|
makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5,width=280,singlerow=True,tooltiptxt="Select an optional F16 GGML Text LoRA base file to use.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,singlerow=True,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
|
makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,singlerow=True,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
|
||||||
makefileentry(model_tab, "Draft Model:", "Select Speculative Text Model File", draftmodel_var, 9,width=280,singlerow=True,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
|
makelabelentry(model_tab, "Vision MaxRes:", visionmaxres_var, 9, padx=100, singleline=True, tooltip=f"Clamp MMProj vision maximum allowed resolution. Allowed values are between 512 to 2048 px (default {default_visionmaxres}).")
|
||||||
makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 11, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
|
makefileentry(model_tab, "Draft Model:", "Select Speculative Text Model File", draftmodel_var, 11,width=280,singlerow=True,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
|
||||||
makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 11, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
|
makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 13, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
|
||||||
makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 11, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
|
makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 13, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
|
||||||
|
makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 13, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
|
||||||
makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 15,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
|
makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 15,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
|
||||||
makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
|
makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
|
||||||
def pickpremadetemplate():
|
def pickpremadetemplate():
|
||||||
|
@ -4022,6 +4028,7 @@ def show_gui():
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
|
args.mmproj = None if mmproj_var.get() == "" else mmproj_var.get()
|
||||||
|
args.visionmaxres = int(visionmaxres_var.get()) if visionmaxres_var.get()!="" else default_visionmaxres
|
||||||
args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
|
args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
|
||||||
args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
|
args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
|
||||||
args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
|
args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
|
||||||
|
@ -4199,6 +4206,8 @@ def show_gui():
|
||||||
lora_var.set(dict["lora"][0])
|
lora_var.set(dict["lora"][0])
|
||||||
|
|
||||||
mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")
|
mmproj_var.set(dict["mmproj"] if ("mmproj" in dict and dict["mmproj"]) else "")
|
||||||
|
if "visionmaxres" in dict and dict["visionmaxres"]:
|
||||||
|
visionmaxres_var.set(dict["visionmaxres"])
|
||||||
draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "")
|
draftmodel_var.set(dict["draftmodel"] if ("draftmodel" in dict and dict["draftmodel"]) else "")
|
||||||
if "draftamount" in dict:
|
if "draftamount" in dict:
|
||||||
draftamount_var.set(dict["draftamount"])
|
draftamount_var.set(dict["draftamount"])
|
||||||
|
@ -4247,7 +4256,7 @@ def show_gui():
|
||||||
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
|
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
|
||||||
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
|
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
|
||||||
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
|
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
|
||||||
ttsmaxlen_var.set(str(dict["ttsmaxlen"]) if ("ttsmaxlen" in dict and dict["ttsmaxlen"]) else str(4096))
|
ttsmaxlen_var.set(str(dict["ttsmaxlen"]) if ("ttsmaxlen" in dict and dict["ttsmaxlen"]) else str(default_ttsmaxlen))
|
||||||
|
|
||||||
importvars_in_progress = False
|
importvars_in_progress = False
|
||||||
gui_changed_modelfile()
|
gui_changed_modelfile()
|
||||||
|
@ -5604,6 +5613,7 @@ if __name__ == '__main__':
|
||||||
advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
|
advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
|
||||||
advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
|
advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
|
||||||
advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
|
advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
|
||||||
|
advparser.add_argument("--visionmaxres", metavar=('[max px]'), help="Clamp MMProj vision maximum allowed resolution. Allowed values are between 512 to 2048 px (default 1024).", type=int, default=default_visionmaxres)
|
||||||
advparser.add_argument("--draftmodel", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
|
advparser.add_argument("--draftmodel", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
|
||||||
advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
|
advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
|
||||||
advparser.add_argument("--draftgpulayers", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
|
advparser.add_argument("--draftgpulayers", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
|
||||||
|
@ -5652,7 +5662,7 @@ if __name__ == '__main__':
|
||||||
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
|
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
|
||||||
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
|
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
|
||||||
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
|
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
|
||||||
ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=4096)
|
ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen)
|
||||||
ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
|
ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
|
||||||
|
|
||||||
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue