add streaming support for oai tools (+2 squashed commit)

Squashed commit:

[4d080b37] qwen2.5vl surgery script

[4bebe7e5] add streaming support for oai tools
Concedo 2025-03-31 16:10:11 +08:00
parent 091eb367fc
commit 1ebadc515e
2 changed files with 129 additions and 66 deletions

View file

@@ -5,10 +5,12 @@ import torch
 import numpy as np
 from gguf import *
 from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLProcessor,
     AutoProcessor,
-    Qwen2VLConfig
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLProcessor,
+    Qwen2VLConfig,
+    Qwen2_5_VLConfig,
 )
@@ -18,8 +20,10 @@ VISION = "clip.vision"
 def k(raw_key: str, arch: str) -> str:
     return raw_key.format(arch=arch)

-def to_gguf_name(name: str) -> str:
+class VL2:
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
         og = name
         name = name.replace("text_model", "t").replace("vision_model", "v")
         name = name.replace("blocks", "blk").replace("embeddings.", "")
@@ -31,8 +35,8 @@ def to_gguf_name(name: str) -> str:
         print(f"[to_gguf_name] {og} --> {name}")
         return name

-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
         vision_model = qwen2vl.visual
         tensor_map = {}
         for name, ten in vision_model.state_dict().items():
@@ -47,9 +51,9 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                 wq = ten[:c]
                 wk = ten[c: c * 2]
                 wv = ten[c * 2:]
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
             elif 'merger' in name:
                 if name.endswith("ln_q.weight"):
                     tensor_map['v.post_ln.weight'] = ten
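For reference, the qkv handling above slices the fused attention projection row-wise into separate q/k/v tensors before renaming. A minimal sketch of that split, using an invented hidden size rather than the real checkpoint shapes:

# Illustrative only: the fused qkv weight has shape (3*c, c); its rows are
# sliced into the q, k and v projections exactly as in the hunk above.
import torch

c = 1280                                  # hypothetical vision hidden size
fused = torch.randn(3 * c, c)             # stand-in for "...attn.qkv.weight"
wq, wk, wv = fused[:c], fused[c:c * 2], fused[c * 2:]
assert wq.shape == wk.shape == wv.shape == (c, c)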
@@ -57,7 +61,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                     tensor_map['v.post_ln.bias'] = ten
                 else:
                     # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                    tensor_map[to_gguf_name(name)] = ten
+                    tensor_map[cls.to_gguf_name(name)] = ten
             elif 'patch_embed.proj.weight' in name:
                 # NOTE: split Conv3D into Conv2Ds
                 c1, c2, kt, kh, kw = ten.shape
@@ -65,7 +69,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                 tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
                 tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
             else:
-                tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten

         for new_name, ten in tensor_map.items():
             if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
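The Conv3D note above refers to the patch embedding, whose temporal kernel dimension (kt == 2) is peeled apart into two Conv2D weights. A small sketch with made-up shapes:

# Illustrative shapes only: split a Conv3D patch-embedding weight
# (c1, c2, kt, kh, kw) into two Conv2D weights along the kt axis,
# mirroring "v.patch_embd.weight" and "v.patch_embd.weight.1" above.
import torch

c1, c2, kt, kh, kw = 1280, 3, 2, 14, 14   # hypothetical Conv3D weight shape
ten = torch.randn(c1, c2, kt, kh, kw)
patch_embd_0 = ten[:, :, 0, ...]
patch_embd_1 = ten[:, :, 1, ...]
assert patch_embd_0.shape == patch_embd_1.shape == (c1, c2, kh, kw)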
@@ -76,6 +80,22 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
         return tensor_map

+class VL25(VL2):
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name

 def main(args):
     if args.data_type == 'fp32':
         dtype = torch.float32
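VL25 only overrides the renaming, extending it to Qwen2.5-VL's gated MLP projections (gate/up/down). Two example renames derived from the rules above (tensor names are made up for illustration; the "vision_model." prefix is what find_vision_tensors prepends before calling to_gguf_name):

print(VL25.to_gguf_name("vision_model.blocks.0.mlp.gate_proj.weight"))
# --> v.blk.0.ffn_gate.weight
print(VL25.to_gguf_name("vision_model.blocks.0.attn.qkv.weight"))
# --> v.blk.0.attn_qkv.weight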
@@ -92,11 +112,18 @@ def main(args):
     model_path = ""
     model_name = args.model_name
     print("model_name: ", model_name)
+    if args.model_type == "qwen2vl":
         qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
             model_name, torch_dtype=dtype, device_map="cpu"
         )
         cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
         vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config

     if os.path.isdir(model_name):
         local_model = True
@@ -125,14 +152,26 @@ def main(args):
     else:
         raise ValueError()

-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    if args.model_type == "qwen2.5vl":
+        fout.add_bool("clip.use_glu_mlp", True) # gate linear unit MLP layer in vision model
+        fout.add_bool("clip.use_rms_norm", True)
+        fout.add_array("clip.vision.fullatt_block_indexes", vcfg.fullatt_block_indexes)
+        fout.add_uint32("clip.vision.window_size", vcfg.window_size)
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+    else:
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)

     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
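If the gguf Python package is installed, one quick way to sanity-check the extra qwen2.5vl metadata is to list the clip.* keys of the written file. A minimal sketch, with a hypothetical output filename:

# Minimal sketch (output filename is hypothetical): print every clip.* key
# in the generated GGUF to confirm use_glu_mlp, use_rms_norm, window_size
# and fullatt_block_indexes were written for --model_type qwen2.5vl.
from gguf import GGUFReader

reader = GGUFReader("qwen2.5-vl-vision.gguf")
for key in reader.fields:
    if key.startswith("clip."):
        print(key)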
@@ -160,6 +199,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
     parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
     main(args)
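Taken together, a Qwen2.5-VL conversion would presumably be invoked with the new flag, e.g. the equivalent of the following parse (model id and values are examples only, not part of the commit):

# Hypothetical invocation, expressed through parse_args for illustration;
# on the command line this corresponds roughly to:
#   python qwen2_vl_surgery.py Qwen/Qwen2.5-VL-3B-Instruct --model_type qwen2.5vl --data_type fp16
args = parser.parse_args(["Qwen/Qwen2.5-VL-3B-Instruct", "--model_type", "qwen2.5vl", "--data_type", "fp16"])
print(args.model_type, args.data_type)   # qwen2.5vl fp16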

View file

@@ -2018,8 +2018,8 @@ def transform_genparams(genparams, api_format):
             #if auto mode, determine whether a tool is needed
             tools_string = json.dumps(tools_array, indent=0)
             should_use_tools = True
-            user_start = adapter_obj.get("user_start", "### Instruction:\n\n")
-            user_end = adapter_obj.get("user_end", "\n\n### Response:\n\n")
+            user_start = user_message_start
+            user_end = assistant_message_start
             if chosen_tool=="auto":
                 temp_poll = {
                     "prompt": f"{user_start}User query:\n\n{messages_string}\n\nTool Code:\n{tools_string}Determine from the provided tool code if the user query would be best answered by a listed tool (One word: yes / no):{user_end}",
@@ -2030,7 +2030,7 @@ def transform_genparams(genparams, api_format):
                     "ban_eos_token":False
                 }
                 temp_poll_result = generate(genparams=temp_poll)
-                if temp_poll_result and not "yes" in temp_poll_result['text'].lower():
+                if temp_poll_result and "yes" not in temp_poll_result['text'].lower():
                     should_use_tools = False
                 if not args.quiet:
                     print(f"\nRelevant tool is listed: {temp_poll_result['text']} ({should_use_tools})")
@@ -2301,6 +2301,10 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
+        # if tools, do not send anything - OAI tool calls will be handled with fakestreaming!
+        using_openai_tools = genparams.get('using_openai_tools', False)
+        if api_format == 4 and using_openai_tools:
+            return
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -2311,6 +2315,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
         await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
+
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
@@ -3188,6 +3193,24 @@ Enter Prompt:<br>
                     self.send_header('content-length', str(len(genresp)))
                     self.end_headers(content_type='application/json')
                     self.wfile.write(genresp)
+                elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
+                    self.send_response(200)
+                    self.send_header("X-Accel-Buffering", "no")
+                    self.send_header("cache-control", "no-cache")
+                    self.send_header("connection", "keep-alive")
+                    self.end_headers(content_type='text/event-stream')
+                    toolsdata_res = []
+                    try:
+                        toolsdata_res = gen['choices'][0]['message']['tool_calls']
+                    except Exception:
+                        toolsdata_res = []
+                    toolsdata_p1 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':None, "tool_calls":toolsdata_res}}]})
+                    toolsdata_p2 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":"tool_calls","delta":{}}]})
+                    self.wfile.write(f'data: {toolsdata_p1}\n\n'.encode())
+                    self.wfile.write(f'data: {toolsdata_p2}\n\n'.encode())
+                    self.wfile.write('data: [DONE]'.encode())
+                    self.wfile.flush()
+                    self.close_connection = True
             except Exception as ex:
                 utfprint(ex,1)
                 print("Generate: The response could not be sent, maybe connection was terminated?")