add streaming support for oai tools (+2 squashed commit)

Squashed commit:

[4d080b37] qwen2.5vl surgery script

[4bebe7e5] add streaming support for oai tools
Concedo 2025-03-31 16:10:11 +08:00
parent 091eb367fc
commit 1ebadc515e
2 changed files with 129 additions and 66 deletions

View file

@@ -5,10 +5,12 @@ import torch
 import numpy as np
 from gguf import *
 from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLProcessor,
     AutoProcessor,
-    Qwen2VLConfig
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLProcessor,
+    Qwen2VLConfig,
+    Qwen2_5_VLConfig,
 )
@@ -18,8 +20,10 @@ VISION = "clip.vision"
 def k(raw_key: str, arch: str) -> str:
     return raw_key.format(arch=arch)

-def to_gguf_name(name: str) -> str:
+class VL2:
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
         og = name
         name = name.replace("text_model", "t").replace("vision_model", "v")
         name = name.replace("blocks", "blk").replace("embeddings.", "")
@@ -31,8 +35,8 @@ def to_gguf_name(name: str) -> str:
         print(f"[to_gguf_name] {og} --> {name}")
         return name

-def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
         vision_model = qwen2vl.visual
         tensor_map = {}
         for name, ten in vision_model.state_dict().items():
@@ -47,9 +51,9 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                 wq = ten[:c]
                 wk = ten[c: c * 2]
                 wv = ten[c * 2:]
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
             elif 'merger' in name:
                 if name.endswith("ln_q.weight"):
                     tensor_map['v.post_ln.weight'] = ten
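For reference, the qkv handling above slices the fused attention projection row-wise into separate q/k/v tensors before renaming. A minimal sketch of that split, using an invented hidden size rather than the real checkpoint shapes:

# Illustrative only: the fused qkv weight has shape (3*c, c); its rows are
# sliced into the q, k and v projections exactly as in the hunk above.
import torch

c = 1280                                  # hypothetical vision hidden size
fused = torch.randn(3 * c, c)             # stand-in for "...attn.qkv.weight"
wq, wk, wv = fused[:c], fused[c:c * 2], fused[c * 2:]
assert wq.shape == wk.shape == wv.shape == (c, c)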
@@ -57,7 +61,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                     tensor_map['v.post_ln.bias'] = ten
                 else:
                     # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                    tensor_map[to_gguf_name(name)] = ten
+                    tensor_map[cls.to_gguf_name(name)] = ten
             elif 'patch_embed.proj.weight' in name:
                 # NOTE: split Conv3D into Conv2Ds
                 c1, c2, kt, kh, kw = ten.shape
@@ -65,7 +69,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                 tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
                 tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
             else:
-                tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten

         for new_name, ten in tensor_map.items():
             if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
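The Conv3D note above refers to the patch embedding, whose temporal kernel dimension (kt == 2) is peeled apart into two Conv2D weights. A small sketch with made-up shapes:

# Illustrative shapes only: split a Conv3D patch-embedding weight
# (c1, c2, kt, kh, kw) into two Conv2D weights along the kt axis,
# mirroring "v.patch_embd.weight" and "v.patch_embd.weight.1" above.
import torch

c1, c2, kt, kh, kw = 1280, 3, 2, 14, 14   # hypothetical Conv3D weight shape
ten = torch.randn(c1, c2, kt, kh, kw)
patch_embd_0 = ten[:, :, 0, ...]
patch_embd_1 = ten[:, :, 1, ...]
assert patch_embd_0.shape == patch_embd_1.shape == (c1, c2, kh, kw)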
@@ -76,6 +80,22 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
         return tensor_map

+class VL25(VL2):
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name

 def main(args):
     if args.data_type == 'fp32':
         dtype = torch.float32
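VL25 only overrides the renaming, extending it to Qwen2.5-VL's gated MLP projections (gate/up/down). Two example renames derived from the rules above (tensor names are made up for illustration; the "vision_model." prefix is what find_vision_tensors prepends before calling to_gguf_name):

print(VL25.to_gguf_name("vision_model.blocks.0.mlp.gate_proj.weight"))
# --> v.blk.0.ffn_gate.weight
print(VL25.to_gguf_name("vision_model.blocks.0.attn.qkv.weight"))
# --> v.blk.0.attn_qkv.weight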
@@ -92,11 +112,18 @@ def main(args):
     model_path = ""
     model_name = args.model_name
     print("model_name: ", model_name)
+    if args.model_type == "qwen2vl":
         qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
             model_name, torch_dtype=dtype, device_map="cpu"
         )
         cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
         vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config

     if os.path.isdir(model_name):
         local_model = True
@@ -125,14 +152,26 @@ def main(args):
     else:
         raise ValueError()

-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    if args.model_type == "qwen2.5vl":
+        fout.add_bool("clip.use_glu_mlp", True) # gate linear unit MLP layer in vision model
+        fout.add_bool("clip.use_rms_norm", True)
+        fout.add_array("clip.vision.fullatt_block_indexes", vcfg.fullatt_block_indexes)
+        fout.add_uint32("clip.vision.window_size", vcfg.window_size)
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+    else:
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)

     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
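If the gguf Python package is installed, one quick way to sanity-check the extra qwen2.5vl metadata is to list the clip.* keys of the written file. A minimal sketch, with a hypothetical output filename:

# Minimal sketch (output filename is hypothetical): print every clip.* key
# in the generated GGUF to confirm use_glu_mlp, use_rms_norm, window_size
# and fullatt_block_indexes were written for --model_type qwen2.5vl.
from gguf import GGUFReader

reader = GGUFReader("qwen2.5-vl-vision.gguf")
for key in reader.fields:
    if key.startswith("clip."):
        print(key)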
@@ -160,6 +199,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
     parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
     main(args)
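Taken together, a Qwen2.5-VL conversion would presumably be invoked with the new flag, e.g. the equivalent of the following parse (model id and values are examples only, not part of the commit):

# Hypothetical invocation, expressed through parse_args for illustration;
# on the command line this corresponds roughly to:
#   python qwen2_vl_surgery.py Qwen/Qwen2.5-VL-3B-Instruct --model_type qwen2.5vl --data_type fp16
args = parser.parse_args(["Qwen/Qwen2.5-VL-3B-Instruct", "--model_type", "qwen2.5vl", "--data_type", "fp16"])
print(args.model_type, args.data_type)   # qwen2.5vl fp16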

View file

@@ -2018,8 +2018,8 @@ def transform_genparams(genparams, api_format):
             #if auto mode, determine whether a tool is needed
             tools_string = json.dumps(tools_array, indent=0)
             should_use_tools = True
-            user_start = adapter_obj.get("user_start", "### Instruction:\n\n")
-            user_end = adapter_obj.get("user_end", "\n\n### Response:\n\n")
+            user_start = user_message_start
+            user_end = assistant_message_start
             if chosen_tool=="auto":
                 temp_poll = {
                     "prompt": f"{user_start}User query:\n\n{messages_string}\n\nTool Code:\n{tools_string}Determine from the provided tool code if the user query would be best answered by a listed tool (One word: yes / no):{user_end}",
@@ -2030,7 +2030,7 @@ def transform_genparams(genparams, api_format):
                     "ban_eos_token":False
                 }
                 temp_poll_result = generate(genparams=temp_poll)
-                if temp_poll_result and not "yes" in temp_poll_result['text'].lower():
+                if temp_poll_result and "yes" not in temp_poll_result['text'].lower():
                     should_use_tools = False
                 if not args.quiet:
                     print(f"\nRelevant tool is listed: {temp_poll_result['text']} ({should_use_tools})")
@@ -2301,6 +2301,10 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
+        # if tools, do not send anything - OAI tool calls will be handled with fakestreaming!
+        using_openai_tools = genparams.get('using_openai_tools', False)
+        if api_format == 4 and using_openai_tools:
+            return
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -2311,6 +2315,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
         await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
+
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
@@ -3188,6 +3193,24 @@ Enter Prompt:<br>
                     self.send_header('content-length', str(len(genresp)))
                     self.end_headers(content_type='application/json')
                     self.wfile.write(genresp)
+                elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
+                    self.send_response(200)
+                    self.send_header("X-Accel-Buffering", "no")
+                    self.send_header("cache-control", "no-cache")
+                    self.send_header("connection", "keep-alive")
+                    self.end_headers(content_type='text/event-stream')
+                    toolsdata_res = []
+                    try:
+                        toolsdata_res = gen['choices'][0]['message']['tool_calls']
+                    except Exception:
+                        toolsdata_res = []
+                    toolsdata_p1 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':None, "tool_calls":toolsdata_res}}]})
+                    toolsdata_p2 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":"tool_calls","delta":{}}]})
+                    self.wfile.write(f'data: {toolsdata_p1}\n\n'.encode())
+                    self.wfile.write(f'data: {toolsdata_p2}\n\n'.encode())
+                    self.wfile.write('data: [DONE]'.encode())
+                    self.wfile.flush()
+                    self.close_connection = True
             except Exception as ex:
                 utfprint(ex,1)
                 print("Generate: The response could not be sent, maybe connection was terminated?")