Now accept multiple images as reference images

2025-09-15 03:19:41 +00:00 · 2025-06-28 17:30:28 +08:00 · 2025-06-28 17:30:28 +08:00 · 4ec0e0fd21
commit 4ec0e0fd21
parent 2e14338455
6 changed files with 128 additions and 76 deletions
--- a/expose.h
+++ b/expose.h
@ -181,7 +181,8 @@ struct sd_generation_inputs
    const char * negative_prompt = nullptr;
    const char * init_images = "";
    const char * mask = "";
-    const char * extra_image = "";
+    const int extra_images_len = 0;
+    const char ** extra_images = nullptr;
    const bool flip_mask = false;
    const float denoising_strength = 0.0f;
    const float cfg_scale = 0.0f;
--- a/kcpp_sdui.embd
+++ b/kcpp_sdui.embd
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -59,6 +59,7 @@ stop_token_max = 256
 ban_token_max = 768
 logit_bias_max = 512
 dry_seq_break_max = 128
+extra_images_max = 4

 # global vars
 KcppVersion = "1.94.2"
@ -291,7 +292,8 @@ class sd_generation_inputs(ctypes.Structure):
                ("negative_prompt", ctypes.c_char_p),
                ("init_images", ctypes.c_char_p),
                ("mask", ctypes.c_char_p),
-                ("extra_image", ctypes.c_char_p),
+                ("extra_images_len", ctypes.c_int),
+                ("extra_images", ctypes.POINTER(ctypes.c_char_p)),
                ("flip_mask", ctypes.c_bool),
                ("denoising_strength", ctypes.c_float),
                ("cfg_scale", ctypes.c_float),
@ -1714,7 +1716,9 @@ def sd_generate(genparams):
        seed = random.randint(100000, 999999)
    sample_method = genparams.get("sampler_name", "k_euler_a")
    clip_skip = tryparseint(genparams.get("clip_skip", -1),-1)
-    extra_image = strip_base64_prefix(genparams.get("extra_image", ""))
+    extra_images_arr = genparams.get("extra_images", [])
+    extra_images_arr = ([] if not extra_images_arr else extra_images_arr)
+    extra_images_arr = extra_images_arr[:extra_images_max]

    #clean vars
    cfg_scale = (1 if cfg_scale < 1 else (25 if cfg_scale > 25 else cfg_scale))
@ -1728,7 +1732,11 @@ def sd_generate(genparams):
    inputs.negative_prompt = negative_prompt.encode("UTF-8")
    inputs.init_images = init_images.encode("UTF-8")
    inputs.mask = "".encode("UTF-8") if not mask else mask.encode("UTF-8")
-    inputs.extra_image = "".encode("UTF-8") if not extra_image else extra_image.encode("UTF-8")
+    inputs.extra_images_len = len(extra_images_arr)
+    inputs.extra_images = (ctypes.c_char_p * inputs.extra_images_len)()
+    for n, estr in enumerate(extra_images_arr):
+        extra_image = strip_base64_prefix(estr)
+        inputs.extra_images[n] = extra_image.encode("UTF-8")
    inputs.flip_mask = flip_mask
    inputs.cfg_scale = cfg_scale
    inputs.denoising_strength = denoising_strength
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@ -116,7 +116,8 @@ static int sddebugmode = 0;
 static std::string recent_data = "";
 static uint8_t * input_image_buffer = NULL;
 static uint8_t * input_mask_buffer = NULL;
-static uint8_t * input_extraimage_buffer = NULL;
+static std::vector<uint8_t *> input_extraimage_buffers;
+const int max_extra_images = 4;

 static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
 static int cfg_tiled_vae_threshold = 0;
@ -288,8 +289,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
        sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier);
    }

-    return true;
+    input_extraimage_buffers.reserve(max_extra_images);

+    return true;
 }

 std::string clean_input_prompt(const std::string& input) {
@ -434,7 +436,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
    std::string cleannegprompt = clean_input_prompt(inputs.negative_prompt);
    std::string img2img_data = std::string(inputs.init_images);
    std::string img2img_mask = std::string(inputs.mask);
-    std::string extra_image_data = std::string(inputs.extra_image);
+    std::vector<std::string> extra_image_data;
+    for(int i=0;i<inputs.extra_images_len;++i)
+    {
+        extra_image_data.push_back(std::string(inputs.extra_images[i]));
+    }
+
    std::string sampler = inputs.sample_method;

    sd_params->prompt = cleanprompt;
@ -503,17 +510,20 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)

    //for img2img
    sd_image_t input_image = {0,0,0,nullptr};
-    sd_image_t extraimage_reference = {0,0,0,nullptr};
+    std::vector<sd_image_t> extraimage_references;
+    extraimage_references.reserve(max_extra_images);
    std::vector<uint8_t> image_buffer;
    std::vector<uint8_t> image_mask_buffer;
-    std::vector<uint8_t> extraimage_buffer;
+    std::vector<std::vector<uint8_t>> extraimage_buffers;
+    extraimage_buffers.reserve(max_extra_images);
+
    int nx, ny, nc;
    int img2imgW = sd_params->width; //for img2img input
    int img2imgH = sd_params->height;
    int img2imgC = 3; // Assuming RGB image
    std::vector<uint8_t> resized_image_buf(img2imgW * img2imgH * img2imgC);
    std::vector<uint8_t> resized_mask_buf(img2imgW * img2imgH * img2imgC);
-    std::vector<uint8_t> resized_extraimage_buf(img2imgW * img2imgH * img2imgC);
+    std::vector<std::vector<uint8_t>> resized_extraimage_bufs(max_extra_images, std::vector<uint8_t>(img2imgW * img2imgH * img2imgC));

    std::string ts = get_timestamp_str();
    if(!sd_is_quiet)
@ -558,29 +568,39 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
        sd_params->sample_method = sample_method_t::EULER_A;
    }

-    if(extra_image_data!="")
+    if(extra_image_data.size()>0)
    {
-        if(input_extraimage_buffer!=nullptr) //just in time free old buffer
+        if(input_extraimage_buffers.size()>0) //just in time free old buffer
        {
-            stbi_image_free(input_extraimage_buffer);
-            input_extraimage_buffer = nullptr;
+            for(int i=0;i<input_extraimage_buffers.size();++i)
+            {
+                stbi_image_free(input_extraimage_buffers[i]);
            }
+            input_extraimage_buffers.clear();
+        }
+        extraimage_buffers.clear();
+        extraimage_references.clear();
+        for(int i=0;i<extra_image_data.size() && i<max_extra_images;++i)
+        {
            int nx2, ny2, nc2;
            int desiredchannels = 3;
-        extraimage_buffer = kcpp_base64_decode(extra_image_data);
-        input_extraimage_buffer = stbi_load_from_memory(extraimage_buffer.data(), extraimage_buffer.size(), &nx2, &ny2, &nc2, desiredchannels);
+            extraimage_buffers.push_back(kcpp_base64_decode(extra_image_data[i]));
+            input_extraimage_buffers.push_back(stbi_load_from_memory(extraimage_buffers[i].data(), extraimage_buffers[i].size(), &nx2, &ny2, &nc2, desiredchannels));
            // Resize the image
-        int resok = stbir_resize_uint8(input_extraimage_buffer, nx2, ny2, 0, resized_extraimage_buf.data(), img2imgW, img2imgH, 0, desiredchannels);
+            int resok = stbir_resize_uint8(input_extraimage_buffers[i], nx2, ny2, 0, resized_extraimage_bufs[i].data(), img2imgW, img2imgH, 0, desiredchannels);
            if (!resok) {
                printf("\nKCPP SD: resize extra image failed!\n");
                output.data = "";
                output.status = 0;
                return output;
            }
+            sd_image_t extraimage_reference;
            extraimage_reference.width = img2imgW;
            extraimage_reference.height = img2imgH;
            extraimage_reference.channel = desiredchannels;
-        extraimage_reference.data = resized_extraimage_buf.data();
+            extraimage_reference.data = resized_extraimage_bufs[i].data();
+            extraimage_references.push_back(extraimage_reference);
+        }

        //ensure prompt has img keyword, otherwise append it
        if(photomaker_enabled)
@ -595,9 +615,29 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
    }

    std::vector<sd_image_t> kontext_imgs;
-    if(extra_image_data!="" && loadedsdver==SDVersion::VERSION_FLUX && !sd_loaded_chroma())
+    if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !sd_loaded_chroma())
    {
-        kontext_imgs.push_back(extraimage_reference);
+        for(int i=0;i<extra_image_data.size();++i)
+        {
+            kontext_imgs.push_back(extraimage_references[i]);
+        }
+        if(!sd_is_quiet && sddebugmode==1)
+        {
+            printf("\nFlux Kontext: Using %d reference images\n",kontext_imgs.size());
+        }
+    }
+
+    std::vector<sd_image_t*> photomaker_imgs;
+    if(photomaker_enabled && extra_image_data.size()>0)
+    {
+        for(int i=0;i<extra_image_data.size();++i)
+        {
+            photomaker_imgs.push_back(&extraimage_references[i]);
+        }
+        if(!sd_is_quiet && sddebugmode==1)
+        {
+            printf("\nPhotomaker: Using %d reference images\n",photomaker_imgs.size());
+        }
    }

    if (sd_params->mode == TXT2IMG) {
@ -644,7 +684,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                          sd_params->slg_scale,
                          sd_params->skip_layer_start,
                          sd_params->skip_layer_end,
-                          (photomaker_enabled && extra_image_data!=""?(&extraimage_reference):nullptr));
+                          photomaker_imgs);
    } else {

        if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) {
@ -769,7 +809,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                            sd_params->slg_scale,
                            sd_params->skip_layer_start,
                            sd_params->skip_layer_end,
-                            (photomaker_enabled && extra_image_data!=""?(&extraimage_reference):nullptr));
+                            photomaker_imgs);
    }

    if (results == NULL) {
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@ -1422,7 +1422,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                           float skip_layer_start                        = 0.01,
                           float skip_layer_end                          = 0.2,
                           ggml_tensor* masked_image                     = NULL,
-                           const sd_image_t* photomaker_reference = nullptr) {
+                           const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
    if (seed < 0) {
        // Generally, when using the provided command line, the seed is always >0.
        // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@ -1465,7 +1465,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
    ggml_tensor* init_img = NULL;
    SDCondition id_cond;
    std::vector<bool> class_tokens_mask;
-    if (sd_ctx->sd->pmid_model && photomaker_reference!=nullptr)
+    if (sd_ctx->sd->pmid_model && photomaker_references.size()>0)
    {
        sd_ctx->sd->stacked_id = true; //turn on photomaker if needed
    }
@ -1512,15 +1512,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
            }
        }

-        // handle single photomaker image passed in by kcpp
-        if (sd_ctx->sd->pmid_model && photomaker_reference!=nullptr)
+        // handle multiple photomaker images passed in by kcpp
+        if (sd_ctx->sd->pmid_model && photomaker_references.size()>0)
+        {
+            for(int i=0;i<photomaker_references.size();++i)
            {
                int c = 0;
                int width, height;
-            width = photomaker_reference->width;
-            height = photomaker_reference->height;
-            c = photomaker_reference->channel;
-            uint8_t* input_image_buffer = photomaker_reference->data;
+                width = photomaker_references[i]->width;
+                height = photomaker_references[i]->height;
+                c = photomaker_references[i]->channel;
+                uint8_t* input_image_buffer = photomaker_references[i]->data;
                sd_image_t* input_image = NULL;
                input_image  = new sd_image_t{(uint32_t)width,
                                                (uint32_t)height,
@ -1534,6 +1536,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                    input_id_images.push_back(input_image);
                }
            }
+        }

        if (input_id_images.size() > 0) {
            sd_ctx->sd->pmid_model->style_strength = style_ratio;
@ -1790,7 +1793,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                    float slg_scale          = 0,
                    float skip_layer_start   = 0.01,
                    float skip_layer_end     = 0.2,
-                    const sd_image_t* photomaker_reference = nullptr) {
+                    const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
    LOG_DEBUG("txt2img %dx%d", width, height);
    if (sd_ctx == NULL) {
@ -1887,7 +1890,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                                               skip_layer_start,
                                               skip_layer_end,
                                               nullptr,
-                                               photomaker_reference);
+                                               photomaker_references);

    size_t t1 = ggml_time_ms();

@ -1924,7 +1927,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                    float slg_scale          = 0,
                    float skip_layer_start   = 0.01,
                    float skip_layer_end     = 0.2,
-                    const sd_image_t* photomaker_reference = nullptr) {
+                    const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
    LOG_DEBUG("img2img %dx%d", width, height);
    if (sd_ctx == NULL) {
@ -2089,7 +2092,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                               skip_layer_start,
                                               skip_layer_end,
                                               masked_image,
-                                               photomaker_reference);
+                                               photomaker_references);

    size_t t2 = ggml_time_ms();

--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@ -183,7 +183,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                           float slg_scale,
                           float skip_layer_start,
                           float skip_layer_end,
-                           const sd_image_t* photomaker_reference);
+                           const std::vector<sd_image_t*> photomaker_references);

 SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                           sd_image_t init_image,
@ -213,7 +213,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                           float slg_scale,
                           float skip_layer_start,
                           float skip_layer_end,
-                           const sd_image_t* photomaker_reference);
+                           const std::vector<sd_image_t*> photomaker_references);

 SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                           sd_image_t init_image,