From 1ebadc515e13708a00cf964e5825f6682cd27084 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 31 Mar 2025 16:10:11 +0800
Subject: [PATCH] add streaming support for oai tools (+2 squashed commits)

Squashed commit:

[4d080b37] qwen2.5vl surgery script

[4bebe7e5] add streaming support for oai tools
---
 examples/llava/qwen2_vl_surgery.py | 166 ++++++++++++++++++-----------
 koboldcpp.py                       |  29 ++++-
 2 files changed, 129 insertions(+), 66 deletions(-)

diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
index c87606b4f..8f7a94e5c 100644
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -5,10 +5,12 @@
 import torch
 import numpy as np
 from gguf import *
 from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLProcessor,
     AutoProcessor,
-    Qwen2VLConfig
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLProcessor,
+    Qwen2VLConfig,
+    Qwen2_5_VLConfig,
 )

@@ -18,62 +20,80 @@ VISION = "clip.vision"
 def k(raw_key: str, arch: str) -> str:
     return raw_key.format(arch=arch)

+class VL2:

-def to_gguf_name(name: str) -> str:
-    og = name
-    name = name.replace("text_model", "t").replace("vision_model", "v")
-    name = name.replace("blocks", "blk").replace("embeddings.", "")
-    name = name.replace("attn.", "attn_")
-    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
-    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
-    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
-    name = name.replace("merger.mlp", 'mm')
-    print(f"[to_gguf_name] {og} --> {name}")
-    return name
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
+        # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[to_gguf_name] {og} --> {name}")
+        return name

-
-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
-    vision_model = qwen2vl.visual
-    tensor_map = {}
-    for name, ten in vision_model.state_dict().items():
-        ten = ten.numpy()
-        if 'qkv' in name:
-            if ten.ndim == 2: # weight
-                c3, _ = ten.shape
-            else: # bias
-                c3 = ten.shape[0]
-            assert c3 % 3 == 0
-            c = c3 // 3
-            wq = ten[:c]
-            wk = ten[c: c * 2]
-            wv = ten[c * 2:]
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
-        elif 'merger' in name:
-            if name.endswith("ln_q.weight"):
-                tensor_map['v.post_ln.weight'] = ten
-            elif name.endswith("ln_q.bias"):
-                tensor_map['v.post_ln.bias'] = ten
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
+        vision_model = qwen2vl.visual
+        tensor_map = {}
+        for name, ten in vision_model.state_dict().items():
+            ten = ten.numpy()
+            if 'qkv' in name:
+                if ten.ndim == 2: # weight
+                    c3, _ = ten.shape
+                else: # bias
+                    c3 = ten.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = ten[:c]
+                wk = ten[c: c * 2]
+                wv = ten[c * 2:]
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+            elif 'merger' in name:
+                if name.endswith("ln_q.weight"):
+                    tensor_map['v.post_ln.weight'] = ten
+                elif name.endswith("ln_q.bias"):
+                    tensor_map['v.post_ln.bias'] = ten
+                else:
+                    # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
+                    tensor_map[cls.to_gguf_name(name)] = ten
+            elif 'patch_embed.proj.weight' in name:
+                # NOTE: split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = ten.shape
+                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+                tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+                tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
             else:
-                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                tensor_map[to_gguf_name(name)] = ten
-        elif 'patch_embed.proj.weight' in name:
-            # NOTE: split Conv3D into Conv2Ds
-            c1, c2, kt, kh, kw = ten.shape
-            assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
-            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
-            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
-        else:
-            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten

-    for new_name, ten in tensor_map.items():
-        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
-            tensor_map[new_name] = ten.astype(np.float32)
-        else:
-            tensor_map[new_name] = ten.astype(dtype)
-    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
-    return tensor_map
+        for new_name, ten in tensor_map.items():
+            if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+                tensor_map[new_name] = ten.astype(np.float32)
+            else:
+                tensor_map[new_name] = ten.astype(dtype)
+        tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
+        return tensor_map
+
+
+class VL25(VL2):
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name


 def main(args):
@@ -92,11 +112,18 @@ def main(args):
     model_path = ""
     model_name = args.model_name
     print("model_name: ", model_name)
-    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
-        model_name, torch_dtype=dtype, device_map="cpu"
-    )
-    cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
-    vcfg = cfg.vision_config
+    if args.model_type == "qwen2vl":
+        qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config

     if os.path.isdir(model_name):
         local_model = True
@@ -125,14 +152,26 @@ def main(args):
     else:
         raise ValueError()

-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    if args.model_type == "qwen2.5vl":
+        fout.add_bool("clip.use_glu_mlp", True) # gated linear unit MLP layer in vision model
+        fout.add_bool("clip.use_rms_norm", True)
+        fout.add_array("clip.vision.fullatt_block_indexes", vcfg.fullatt_block_indexes)
+        fout.add_uint32("clip.vision.window_size", vcfg.window_size)
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+    else:
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)

     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
@@ -160,6 +199,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
     main(args)
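Note on the surgery script above: the renaming done by VL2.to_gguf_name and VL25.to_gguf_name is a plain chain of string substitutions; VL25 only differs in mapping Qwen2.5-VL's gated MLP projections (gate_proj/up_proj/down_proj) where Qwen2-VL used fc1/fc2. A minimal standalone sketch of the VL25 chain, with illustrative tensor names (not taken from a real checkpoint):

    # Standalone sketch of the VL25 renaming chain from the patch above.
    # Sample names below are illustrative; real checkpoints may differ.
    def vl25_to_gguf_name(name: str) -> str:
        name = name.replace("text_model", "t").replace("vision_model", "v")
        name = name.replace("blocks", "blk").replace("embeddings.", "")
        name = name.replace("attn.", "attn_")
        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
        name = name.replace("merger.mlp", "mm")
        return name

    for sample in ("vision_model.blocks.0.attn.qkv.weight",
                   "vision_model.blocks.0.attn.proj.weight",
                   "vision_model.blocks.0.mlp.gate_proj.weight",
                   "merger.mlp.0.weight"):
        print(sample, "->", vl25_to_gguf_name(sample))
    # vision_model.blocks.0.attn.qkv.weight      -> v.blk.0.attn_qkv.weight
    # vision_model.blocks.0.attn.proj.weight     -> v.blk.0.attn_out.weight
    # vision_model.blocks.0.mlp.gate_proj.weight -> v.blk.0.ffn_gate.weight
    # merger.mlp.0.weight                        -> mm.0.weight

The fused "qkv" survives the renaming on purpose: find_vision_tensors then splits each fused tensor into separate q/k/v entries via .replace("qkv", "q") and friends, as seen in the hunks above.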
diff --git a/koboldcpp.py b/koboldcpp.py
index 4705d0f0f..83cac7e19 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -2018,8 +2018,8 @@ def transform_genparams(genparams, api_format):
                 #if auto mode, determine whether a tool is needed
                 tools_string = json.dumps(tools_array, indent=0)
                 should_use_tools = True
-                user_start = adapter_obj.get("user_start", "### Instruction:\n\n")
-                user_end = adapter_obj.get("user_end", "\n\n### Response:\n\n")
+                user_start = user_message_start
+                user_end = assistant_message_start
                 if chosen_tool=="auto":
                     temp_poll = {
                         "prompt": f"{user_start}User query:\n\n{messages_string}\n\nTool Code:\n{tools_string}Determine from the provided tool code if the user query would be best answered by a listed tool (One word: yes / no):{user_end}",
@@ -2030,7 +2030,7 @@ def transform_genparams(genparams, api_format):
                         "ban_eos_token":False
                     }
                     temp_poll_result = generate(genparams=temp_poll)
-                    if temp_poll_result and not "yes" in temp_poll_result['text'].lower():
+                    if temp_poll_result and "yes" not in temp_poll_result['text'].lower():
                         should_use_tools = False
                     if not args.quiet:
                         print(f"\nRelevant tool is listed: {temp_poll_result['text']} ({should_use_tools})")
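The "auto" tool-choice path above gates tool use behind a cheap one-word probe before the request is transformed. A condensed sketch of that decision; generate() is assumed to behave like koboldcpp's helper and return a dict with a "text" field, and the sampler settings here are illustrative rather than the patch's exact values:

    # Condensed sketch of the "auto" tool-choice probe shown in the hunks above.
    def should_invoke_tools(messages_string, tools_string, user_start, user_end, generate):
        probe = {
            "prompt": (f"{user_start}User query:\n\n{messages_string}\n\n"
                       f"Tool Code:\n{tools_string}"
                       f"Determine from the provided tool code if the user query "
                       f"would be best answered by a listed tool (One word: yes / no):{user_end}"),
            "max_length": 4,     # illustrative: a one-word answer suffices
            "temperature": 0.1,  # illustrative: near-greedy for a stable yes/no
            "ban_eos_token": False,
        }
        result = generate(genparams=probe)
        # Default to using tools; back off only on a clear non-"yes" answer.
        if result and "yes" not in result["text"].lower():
            return False
        return True

Framing the prompt with the instruct template's own user/assistant markers (user_message_start / assistant_message_start) rather than hardcoded Alpaca-style tags is what the first hunk changes.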
@@ -2301,6 +2301,10 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):

     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
+        # if tools, do not send anything - OAI tool calls will be handled with fake streaming!
+        using_openai_tools = genparams.get('using_openai_tools', False)
+        if api_format == 4 and using_openai_tools:
+            return
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -2311,6 +2315,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
         await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
+
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
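With live streaming suppressed above, the finished tool-call response is replayed as exactly two OpenAI-style chunks plus a terminator (next hunk). For reference, a sketch of the frames a client ends up receiving; the ids, timestamps, model name, and the tool call itself are illustrative:

    # Illustrative shape of the two fake-streaming frames (values are made up;
    # ids, timestamps and the model name differ at runtime).
    import json, time

    tool_calls = [{"id": "call_0", "type": "function",
                   "function": {"name": "get_weather",
                                "arguments": "{\"city\": \"Paris\"}"}}]
    chunk1 = {"id": "koboldcpp", "object": "chat.completion.chunk",
              "created": int(time.time()), "model": "my-model",
              "choices": [{"index": 0, "finish_reason": None,
                           "delta": {"role": "assistant", "content": None,
                                     "tool_calls": tool_calls}}]}
    chunk2 = {"id": "koboldcpp", "object": "chat.completion.chunk",
              "created": int(time.time()), "model": "my-model",
              "choices": [{"index": 0, "finish_reason": "tool_calls", "delta": {}}]}
    for chunk in (chunk1, chunk2):
        print(f"data: {json.dumps(chunk)}\n")
    print("data: [DONE]")

Because the whole call arrives in a single delta, clients that already accumulate OpenAI streaming tool_calls fragments should need no changes.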
@@ -3188,6 +3193,24 @@ Enter Prompt:
                 self.send_header('content-length', str(len(genresp)))
                 self.end_headers(content_type='application/json')
                 self.wfile.write(genresp)
+            elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
+                self.send_response(200)
+                self.send_header("X-Accel-Buffering", "no")
+                self.send_header("cache-control", "no-cache")
+                self.send_header("connection", "keep-alive")
+                self.end_headers(content_type='text/event-stream')
+                toolsdata_res = []
+                try:
+                    toolsdata_res = gen['choices'][0]['message']['tool_calls']
+                except Exception:
+                    toolsdata_res = []
+                toolsdata_p1 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':None, "tool_calls":toolsdata_res}}]})
+                toolsdata_p2 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":"tool_calls","delta":{}}]})
+                self.wfile.write(f'data: {toolsdata_p1}\n\n'.encode())
+                self.wfile.write(f'data: {toolsdata_p2}\n\n'.encode())
+                self.wfile.write('data: [DONE]\n\n'.encode())
+                self.wfile.flush()
+                self.close_connection = True
         except Exception as ex:
             utfprint(ex,1)
             print("Generate: The response could not be sent, maybe connection was terminated?")
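A quick end-to-end check of the fake-streaming path using only the standard library. The port, endpoint, and tool schema below are assumptions for a default local KoboldCpp instance, not part of this patch:

    # Hypothetical client-side check of the fake-streamed tool call.
    import json, urllib.request

    req = urllib.request.Request(
        "http://localhost:5001/v1/chat/completions",  # assumed default local endpoint
        data=json.dumps({
            "stream": True,
            "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
            "tools": [{"type": "function", "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city",
                "parameters": {"type": "object",
                               "properties": {"city": {"type": "string"}},
                               "required": ["city"]}}}],
        }).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        for raw in resp:
            line = raw.decode("utf-8").strip()
            if not line.startswith("data: ") or line == "data: [DONE]":
                continue
            delta = json.loads(line[len("data: "):])["choices"][0]["delta"]
            if delta.get("tool_calls"):
                print(delta["tool_calls"])  # first chunk carries the complete call

If the model's yes/no probe decides a tool applies, the first chunk should print the full tool_calls array and the stream should then close with finish_reason "tool_calls" followed by [DONE].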