added photomaker face cloning

Commit 4e40f2aaf4 (parent 21881a861d)
Author: Concedo
Date: 2025-06-20 21:33:36 +08:00
6 changed files with 161 additions and 57 deletions

View file

@@ -169,6 +169,7 @@ struct sd_load_model_inputs
  const char * vae_filename = nullptr;
  const char * lora_filename = nullptr;
  const float lora_multiplier = 1.0f;
+ const char * photomaker_filename = nullptr;
  const int img_hard_limit = 0;
  const int img_soft_limit = 0;
  const bool quiet = false;
@@ -180,6 +181,7 @@ struct sd_generation_inputs
  const char * negative_prompt = nullptr;
  const char * init_images = "";
  const char * mask = "";
+ const char * photomaker_image = "";
  const bool flip_mask = false;
  const float denoising_strength = 0.0f;
  const float cfg_scale = 0.0f;

File diff suppressed because one or more lines are too long

View file

@@ -279,6 +279,7 @@ class sd_load_model_inputs(ctypes.Structure):
  ("vae_filename", ctypes.c_char_p),
  ("lora_filename", ctypes.c_char_p),
  ("lora_multiplier", ctypes.c_float),
+ ("photomaker_filename", ctypes.c_char_p),
  ("img_hard_limit", ctypes.c_int),
  ("img_soft_limit", ctypes.c_int),
  ("quiet", ctypes.c_bool),
@@ -289,6 +290,7 @@ class sd_generation_inputs(ctypes.Structure):
  ("negative_prompt", ctypes.c_char_p),
  ("init_images", ctypes.c_char_p),
  ("mask", ctypes.c_char_p),
+ ("photomaker_image", ctypes.c_char_p),
  ("flip_mask", ctypes.c_bool),
  ("denoising_strength", ctypes.c_float),
  ("cfg_scale", ctypes.c_float),
@@ -657,6 +659,13 @@ def is_incomplete_utf8_sequence(byte_seq): #note, this will only flag INCOMPLETE
  return True #incomplete sequence
  return False #invalid sequence, but not incomplete
+ def strip_base64_prefix(encoded_data):
+     if not encoded_data:
+         return ""
+     if encoded_data.startswith("data:image"):
+         encoded_data = encoded_data.split(',', 1)[-1]
+     return encoded_data
  def unpack_to_dir(destpath = ""):
  srcpath = os.path.abspath(os.path.dirname(__file__))
  cliunpack = False if destpath == "" else True
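Note (illustration, not part of the diff): the new strip_base64_prefix helper only removes a browser-style data-URI header so that the backend receives bare base64. A minimal sketch of its expected behaviour, using made-up example strings:

# hypothetical inputs; behaviour follows the helper added above
strip_base64_prefix("data:image/png;base64,iVBORw0KGgo=")   # -> "iVBORw0KGgo="
strip_base64_prefix("iVBORw0KGgo=")                          # -> returned unchanged
strip_base64_prefix("")                                      # -> ""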
@@ -1523,7 +1532,7 @@ def generate(genparams, stream_flag=False):
  return {"text":outstr,"status":ret.status,"stopreason":ret.stopreason,"prompt_tokens":ret.prompt_tokens, "completion_tokens": ret.completion_tokens}
- def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename):
+ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
  global args
  inputs = sd_load_model_inputs()
  inputs.model_filename = model_filename.encode("UTF-8")
@@ -1547,6 +1556,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
  inputs.t5xxl_filename = t5xxl_filename.encode("UTF-8")
  inputs.clipl_filename = clipl_filename.encode("UTF-8")
  inputs.clipg_filename = clipg_filename.encode("UTF-8")
+ inputs.photomaker_filename = photomaker_filename.encode("UTF-8")
  inputs.img_hard_limit = args.sdclamped
  inputs.img_soft_limit = args.sdclampedsoft
  inputs = set_backend_props(inputs)
@@ -1617,7 +1627,8 @@ def sd_generate(genparams):
  prompt = forced_posprompt
  init_images_arr = genparams.get("init_images", [])
  init_images = ("" if (not init_images_arr or len(init_images_arr)==0 or not init_images_arr[0]) else init_images_arr[0])
- mask = genparams.get("mask", "")
+ init_images = strip_base64_prefix(init_images)
+ mask = strip_base64_prefix(genparams.get("mask", ""))
  flip_mask = genparams.get("inpainting_mask_invert", 0)
  denoising_strength = tryparsefloat(genparams.get("denoising_strength", 0.6),0.6)
  cfg_scale = tryparsefloat(genparams.get("cfg_scale", 5),5)
@@ -1629,6 +1640,7 @@ def sd_generate(genparams):
  seed = random.randint(100000, 999999)
  sample_method = genparams.get("sampler_name", "k_euler_a")
  clip_skip = tryparseint(genparams.get("clip_skip", -1),-1)
+ photomaker_image = strip_base64_prefix(genparams.get("photomaker_image", ""))
  #clean vars
  cfg_scale = (1 if cfg_scale < 1 else (25 if cfg_scale > 25 else cfg_scale))
@@ -1642,6 +1654,7 @@ def sd_generate(genparams):
  inputs.negative_prompt = negative_prompt.encode("UTF-8")
  inputs.init_images = init_images.encode("UTF-8")
  inputs.mask = "".encode("UTF-8") if not mask else mask.encode("UTF-8")
+ inputs.photomaker_image = "".encode("UTF-8") if not photomaker_image else photomaker_image.encode("UTF-8")
  inputs.flip_mask = flip_mask
  inputs.cfg_scale = cfg_scale
  inputs.denoising_strength = denoising_strength
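Illustration (not in the commit): with the fields above, a generation request can now carry a reference face alongside the usual parameters. A sketch of a payload as sd_generate() would read it; any key not shown in this diff is an assumption about the existing request format, and the values are placeholders:

# hypothetical genparams dict handed to sd_generate()
genparams = {
    "prompt": "photo of a man img, wearing a suit",     # the "img" trigger word is appended server-side if missing
    "photomaker_image": "data:image/png;base64,....",   # reference face; prefix stripped by strip_base64_prefix
    "cfg_scale": 5,
    "sampler_name": "k_euler_a",
    "clip_skip": -1,
}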
@@ -4288,6 +4301,7 @@ def show_gui():
  sd_t5xxl_var = ctk.StringVar()
  sd_clipl_var = ctk.StringVar()
  sd_clipg_var = ctk.StringVar()
+ sd_photomaker_var = ctk.StringVar()
  sd_vaeauto_var = ctk.IntVar(value=0)
  sd_notile_var = ctk.IntVar(value=0)
  sd_clamped_var = ctk.StringVar(value="0")
@@ -5002,13 +5016,12 @@ def show_gui():
  makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
  makelabelentry(images_tab, "Image LoRA Multiplier:" , sd_loramult_var, 22, 50,padx=290,singleline=True,tooltip="What mutiplier value to apply the SD LoRA with.")
  makefileentry(images_tab, "T5-XXL File:", "Select Optional T5-XXL model file (SD3 or flux)",sd_t5xxl_var, 24, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="Select a .safetensors t5xxl file to be loaded.")
  makefileentry(images_tab, "Clip-L File:", "Select Optional Clip-L model file (SD3 or flux)",sd_clipl_var, 26, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="Select a .safetensors t5xxl file to be loaded.")
  makefileentry(images_tab, "Clip-G File:", "Select Optional Clip-G model file (SD3)",sd_clipg_var, 28, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="Select a .safetensors t5xxl file to be loaded.")
+ makefileentry(images_tab, "PhotoMaker:", "Select Optional PhotoMaker model file (SDXL)",sd_photomaker_var, 30, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")],tooltiptxt="PhotoMaker is a model that allows face cloning.\nSelect a .safetensors PhotoMaker file to be loaded (SDXL only).")
- sdvaeitem1,sdvaeitem2,sdvaeitem3 = makefileentry(images_tab, "Image VAE:", "Select Optional SD VAE file",sd_vae_var, 30, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD VAE file to be loaded.")
+ sdvaeitem1,sdvaeitem2,sdvaeitem3 = makefileentry(images_tab, "Image VAE:", "Select Optional SD VAE file",sd_vae_var, 40, width=280, singlerow=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD VAE file to be loaded.")
  def toggletaesd(a,b,c):
  if sd_vaeauto_var.get()==1:
  sdvaeitem1.grid_remove()
@@ -5019,8 +5032,8 @@ def show_gui():
  sdvaeitem1.grid()
  sdvaeitem2.grid()
  sdvaeitem3.grid()
- makecheckbox(images_tab, "Use TAE SD (AutoFix Broken VAE)", sd_vaeauto_var, 32,command=toggletaesd,tooltiptxt="Replace VAE with TAESD. May fix bad VAE.")
- makecheckbox(images_tab, "No VAE Tiling", sd_notile_var, 34,tooltiptxt="Disables VAE tiling, may not work for large images.")
+ makecheckbox(images_tab, "Use TAE SD (AutoFix Broken VAE)", sd_vaeauto_var, 42,command=toggletaesd,tooltiptxt="Replace VAE with TAESD. May fix bad VAE.")
+ makecheckbox(images_tab, "No VAE Tiling", sd_notile_var, 44,tooltiptxt="Disables VAE tiling, may not work for large images.")
  # audio tab
  audio_tab = tabcontent["Audio"]
@@ -5268,6 +5281,8 @@ def show_gui():
  args.sdclipl = sd_clipl_var.get()
  if sd_clipg_var.get() != "":
  args.sdclipg = sd_clipg_var.get()
+ if sd_photomaker_var.get() != "":
+     args.sdphotomaker = sd_photomaker_var.get()
  if sd_quant_var.get()==1:
  args.sdquant = True
  if sd_lora_var.get() != "":
@@ -5471,6 +5486,7 @@ def show_gui():
  sd_t5xxl_var.set(dict["sdt5xxl"] if ("sdt5xxl" in dict and dict["sdt5xxl"]) else "")
  sd_clipl_var.set(dict["sdclipl"] if ("sdclipl" in dict and dict["sdclipl"]) else "")
  sd_clipg_var.set(dict["sdclipg"] if ("sdclipg" in dict and dict["sdclipg"]) else "")
+ sd_photomaker_var.set(dict["sdphotomaker"] if ("sdphotomaker" in dict and dict["sdphotomaker"]) else "")
  sd_vaeauto_var.set(1 if ("sdvaeauto" in dict and dict["sdvaeauto"]) else 0)
  sd_notile_var.set(1 if ("sdnotile" in dict and dict["sdnotile"]) else 0)
  sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "")
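For reference (assumed layout, not shown in this diff): the loader above reads its values from a saved settings dict, so a fragment that enables PhotoMaker might look like the sketch below; the filename is hypothetical.

# hypothetical fragment of a saved settings dict consumed by the loader above
dict = {
    "sdclipl": "",
    "sdclipg": "",
    "sdphotomaker": "photomaker-v1.safetensors",  # hypothetical path to a PhotoMaker model
    "sdvaeauto": False,
    "sdnotile": False,
}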
@@ -6509,6 +6525,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
  dlfile = download_model_from_url(args.sdclipg,[".gguf",".safetensors"],min_file_size=500000)
  if dlfile:
  args.sdclipg = dlfile
+ if args.sdphotomaker and args.sdphotomaker!="":
+     dlfile = download_model_from_url(args.sdphotomaker,[".gguf",".safetensors"],min_file_size=500000)
+     if dlfile:
+         args.sdphotomaker = dlfile
  if args.sdvae and args.sdvae!="":
  dlfile = download_model_from_url(args.sdvae,[".gguf",".safetensors"],min_file_size=500000)
  if dlfile:
@@ -6785,6 +6805,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
  imgt5xxl = ""
  imgclipl = ""
  imgclipg = ""
+ imgphotomaker = ""
  if args.sdlora:
  if os.path.exists(args.sdlora):
  imglora = os.path.abspath(args.sdlora)
@@ -6810,13 +6831,18 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
  imgclipg = os.path.abspath(args.sdclipg)
  else:
  print("Missing SD Clip-G model file...")
+ if args.sdphotomaker:
+     if os.path.exists(args.sdphotomaker):
+         imgphotomaker = os.path.abspath(args.sdphotomaker)
+     else:
+         print("Missing SD Photomaker model file...")
  imgmodel = os.path.abspath(imgmodel)
  fullsdmodelpath = imgmodel
  friendlysdmodelname = os.path.basename(imgmodel)
  friendlysdmodelname = os.path.splitext(friendlysdmodelname)[0]
  friendlysdmodelname = sanitize_string(friendlysdmodelname)
- loadok = sd_load_model(imgmodel,imgvae,imglora,imgt5xxl,imgclipl,imgclipg)
+ loadok = sd_load_model(imgmodel,imgvae,imglora,imgt5xxl,imgclipl,imgclipg,imgphotomaker)
  print("Load Image Model OK: " + str(loadok))
  if not loadok:
  exitcounter = 999
@@ -7235,6 +7261,7 @@ if __name__ == '__main__':
  sdparsergroup.add_argument("--sdt5xxl", metavar=('[filename]'), help="Specify a T5-XXL safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
  sdparsergroup.add_argument("--sdclipl", metavar=('[filename]'), help="Specify a Clip-L safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
  sdparsergroup.add_argument("--sdclipg", metavar=('[filename]'), help="Specify a Clip-G safetensors model for use in SD3. Leave blank if prebaked or unused.", default="")
+ sdparsergroup.add_argument("--sdphotomaker", metavar=('[filename]'), help="PhotoMaker is a model that allows face cloning. Specify a PhotoMaker safetensors model which will be applied replacing img2img. SDXL models only. Leave blank if unused.", default="")
  sdparsergroupvae = sdparsergroup.add_mutually_exclusive_group()
  sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")
  sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true')

View file

@@ -597,6 +597,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
  GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
  tokens.erase(it);
  return decode(tokens);
+ //return prompt; //kcpp we don't care about photomaker trigger words
  }
  SDCondition get_learned_condition(ggml_context* work_ctx,

View file

@@ -57,7 +57,7 @@ struct SDParams {
  std::string controlnet_path;
  std::string embeddings_path;
  std::string stacked_id_embeddings_path;
- std::string input_id_images_path;
+ std::string input_id_images_path = "";
  sd_type_t wtype = SD_TYPE_COUNT;
  std::string lora_model_dir;
  std::string output_path = "output.png";
@@ -116,6 +116,7 @@ static int sddebugmode = 0;
  static std::string recent_data = "";
  static uint8_t * input_image_buffer = NULL;
  static uint8_t * input_mask_buffer = NULL;
+ static uint8_t * input_photomaker_buffer = NULL;
  static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
  static bool notiling = false;
@@ -134,6 +135,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
  std::string t5xxl_filename = inputs.t5xxl_filename;
  std::string clipl_filename = inputs.clipl_filename;
  std::string clipg_filename = inputs.clipg_filename;
+ std::string photomaker_filename = inputs.photomaker_filename;
  notiling = inputs.notile;
  cfg_side_limit = inputs.img_hard_limit;
  cfg_square_limit = inputs.img_soft_limit;
@@ -164,6 +166,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
  {
  printf("With Custom Clip-G Model: %s\n",clipg_filename.c_str());
  }
+ if(photomaker_filename!="")
+ {
+     printf("With PhotoMaker Model: %s\n",photomaker_filename.c_str());
+ }
  if(inputs.quant)
  {
  printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -205,6 +211,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
  sd_params->t5xxl_path = t5xxl_filename;
  sd_params->clip_l_path = clipl_filename;
  sd_params->clip_g_path = clipg_filename;
+ sd_params->stacked_id_embeddings_path = photomaker_filename;
  //if t5 is set, and model is a gguf, load it as a diffusion model path
  bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5);
  if(sd_params->t5xxl_path!="" && endswithgguf)
@@ -423,6 +430,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  std::string cleannegprompt = clean_input_prompt(inputs.negative_prompt);
  std::string img2img_data = std::string(inputs.init_images);
  std::string img2img_mask = std::string(inputs.mask);
+ std::string photomaker_image_data = std::string(inputs.photomaker_image);
  std::string sampler = inputs.sample_method;
  sd_params->prompt = cleanprompt;
@@ -490,15 +498,17 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  //for img2img
  sd_image_t input_image = {0,0,0,nullptr};
+ sd_image_t photomaker_reference = {0,0,0,nullptr};
  std::vector<uint8_t> image_buffer;
  std::vector<uint8_t> image_mask_buffer;
+ std::vector<uint8_t> photomaker_buffer;
  int nx, ny, nc;
- int nx2, ny2, nc2;
  int img2imgW = sd_params->width; //for img2img input
  int img2imgH = sd_params->height;
  int img2imgC = 3; // Assuming RGB image
  std::vector<uint8_t> resized_image_buf(img2imgW * img2imgH * img2imgC);
  std::vector<uint8_t> resized_mask_buf(img2imgW * img2imgH * img2imgC);
+ std::vector<uint8_t> resized_photomaker_buf(img2imgW * img2imgH * img2imgC);
  std::string ts = get_timestamp_str();
  if(!sd_is_quiet)
@@ -543,6 +553,35 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  sd_params->sample_method = sample_method_t::EULER_A;
  }
+ if(photomaker_image_data!="")
+ {
+     if(input_photomaker_buffer!=nullptr) //just in time free old buffer
+     {
+         stbi_image_free(input_photomaker_buffer);
+         input_photomaker_buffer = nullptr;
+     }
+     int nx2, ny2, nc2;
+     photomaker_buffer = kcpp_base64_decode(photomaker_image_data);
+     input_photomaker_buffer = stbi_load_from_memory(photomaker_buffer.data(), photomaker_buffer.size(), &nx2, &ny2, &nc2, 1);
+     // Resize the image
+     int resok = stbir_resize_uint8(input_photomaker_buffer, nx2, ny2, 0, resized_photomaker_buf.data(), img2imgW, img2imgH, 0, 1);
+     if (!resok) {
+         printf("\nKCPP SD: resize photomaker image failed!\n");
+         output.data = "";
+         output.status = 0;
+         return output;
+     }
+     photomaker_reference.width = img2imgW;
+     photomaker_reference.height = img2imgH;
+     photomaker_reference.channel = img2imgC;
+     photomaker_reference.data = resized_photomaker_buf.data();
+     //ensure prompt has img keyword, otherwise append it
+     if (sd_params->prompt.find("img") == std::string::npos) {
+         sd_params->prompt += " img";
+     }
+ }
  if (sd_params->mode == TXT2IMG) {
  if(!sd_is_quiet && sddebugmode==1)
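Client-side view (sketch, not part of the commit): the block above expects the photomaker_image field to hold plain base64 image bytes, which it decodes with stbi and resizes to the generation resolution. The snippet below shows roughly how a caller could prepare that field, assuming the photomaker_image key from the Python changes above and a hypothetical file path:

import base64

def load_photomaker_reference(path):
    # read a reference face photo and return the base64 string the adapter decodes
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

payload = {
    "prompt": "portrait of a woman img, studio lighting",       # "img" keyword is appended if missing
    "photomaker_image": load_photomaker_reference("face.png"),  # placeholder filename
}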
@@ -585,7 +624,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  sd_params->skip_layers.size(),
  sd_params->slg_scale,
  sd_params->skip_layer_start,
- sd_params->skip_layer_end);
+ sd_params->skip_layer_end,
+ (photomaker_image_data!=""?&photomaker_reference:nullptr));
  } else {
  if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) {
@@ -596,18 +636,11 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  }
  image_buffer = kcpp_base64_decode(img2img_data);
  if(input_image_buffer!=nullptr) //just in time free old buffer
  {
  stbi_image_free(input_image_buffer);
  input_image_buffer = nullptr;
  }
- if(input_mask_buffer!=nullptr) //just in time free old buffer
- {
-     stbi_image_free(input_mask_buffer);
-     input_mask_buffer = nullptr;
- }
  input_image_buffer = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &nx, &ny, &nc, 3);
  if (nx < 64 || ny < 64 || nx > 1024 || ny > 1024 || nc!= 3) {
@@ -634,6 +667,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  if(img2img_mask!="")
  {
+ int nx2, ny2, nc2;
+ if(input_mask_buffer!=nullptr) //just in time free old buffer
+ {
+     stbi_image_free(input_mask_buffer);
+     input_mask_buffer = nullptr;
+ }
  image_mask_buffer = kcpp_base64_decode(img2img_mask);
  input_mask_buffer = stbi_load_from_memory(image_mask_buffer.data(), image_mask_buffer.size(), &nx2, &ny2, &nc2, 1);
  // Resize the image
@@ -709,7 +748,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
  sd_params->skip_layers.size(),
  sd_params->slg_scale,
  sd_params->skip_layer_start,
- sd_params->skip_layer_end);
+ sd_params->skip_layer_end,
+ (photomaker_image_data!=""?&photomaker_reference:nullptr));
  }
  if (results == NULL) {

View file

@@ -328,7 +328,7 @@ public:
  LOG_WARN(
  "!!!It looks like you are using SDXL model. "
  "If you find that the generated images are completely black, "
- "try specifying SDXL VAE FP16 Fix with the --vae parameter. "
+ "try specifying a different VAE. "
  "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
  }
  } else if (sd_version_is_sd3(version)) {
@@ -1408,7 +1408,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
  float slg_scale = 0,
  float skip_layer_start = 0.01,
  float skip_layer_end = 0.2,
- ggml_tensor* masked_image = NULL) {
+ ggml_tensor* masked_image = NULL,
+ const sd_image_t* photomaker_reference = nullptr) {
  if (seed < 0) {
  // Generally, when using the provided command line, the seed is always >0.
  // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1451,6 +1452,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
  ggml_tensor* init_img = NULL;
  SDCondition id_cond;
  std::vector<bool> class_tokens_mask;
+ if (sd_ctx->sd->pmid_model && photomaker_reference!=nullptr)
+ {
+     sd_ctx->sd->stacked_id = true; //turn on photomaker if needed
+ }
  if (sd_ctx->sd->stacked_id) {
  if (!sd_ctx->sd->pmid_lora->applied) {
  t0 = ggml_time_ms();
@@ -1493,6 +1498,30 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
  input_id_images.push_back(input_image);
  }
  }
+ // handle single photomaker image passed in by kcpp
+ if (sd_ctx->sd->pmid_model && photomaker_reference!=nullptr)
+ {
+     int c = 0;
+     int width, height;
+     width = photomaker_reference->width;
+     height = photomaker_reference->height;
+     c = photomaker_reference->channel;
+     uint8_t* input_image_buffer = photomaker_reference->data;
+     sd_image_t* input_image = NULL;
+     input_image = new sd_image_t{(uint32_t)width,
+                                  (uint32_t)height,
+                                  3,
+                                  input_image_buffer};
+     input_image = preprocess_id_image(input_image);
+     if (input_image == NULL) {
+         LOG_ERROR("\npreprocess input id image from kcpp photomaker failed\n");
+     } else {
+         LOG_INFO("\nPhotoMaker loaded image from kcpp\n");
+         input_id_images.push_back(input_image);
+     }
+ }
  if (input_id_images.size() > 0) {
  sd_ctx->sd->pmid_model->style_strength = style_ratio;
  int32_t w = input_id_images[0]->width;
@@ -1744,7 +1773,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
  size_t skip_layers_count = 0,
  float slg_scale = 0,
  float skip_layer_start = 0.01,
- float skip_layer_end = 0.2) {
+ float skip_layer_end = 0.2,
+ const sd_image_t* photomaker_reference = nullptr) {
  std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
  LOG_DEBUG("txt2img %dx%d", width, height);
  if (sd_ctx == NULL) {
@@ -1822,7 +1852,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
  skip_layers_vec,
  slg_scale,
  skip_layer_start,
- skip_layer_end);
+ skip_layer_end,
+ nullptr,
+ photomaker_reference);
  size_t t1 = ggml_time_ms();
@@ -1856,7 +1888,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
  size_t skip_layers_count = 0,
  float slg_scale = 0,
  float skip_layer_start = 0.01,
- float skip_layer_end = 0.2) {
+ float skip_layer_end = 0.2,
+ const sd_image_t* photomaker_reference = nullptr) {
  std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
  LOG_DEBUG("img2img %dx%d", width, height);
  if (sd_ctx == NULL) {
@@ -2002,7 +2035,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
  slg_scale,
  skip_layer_start,
  skip_layer_end,
- masked_image);
+ masked_image,
+ photomaker_reference);
  size_t t2 = ggml_time_ms();