mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

add streaming support for oai tools (+2 squashed commit)

Squashed commit: [4d080b37] qwen2.5vl surgery script [4bebe7e5] add streaming support for oai tools

parent 091eb367fc, commit 1ebadc515e

2 changed files with 129 additions and 66 deletions
qwen2.5vl surgery script:

@@ -5,10 +5,12 @@ import torch
 import numpy as np
 from gguf import *
 from transformers import (
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLProcessor,
     AutoProcessor,
-    Qwen2VLConfig
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLProcessor,
+    Qwen2VLConfig,
+    Qwen2_5_VLConfig,
 )
 
 
@@ -18,7 +20,9 @@ VISION = "clip.vision"
 def k(raw_key: str, arch: str) -> str:
     return raw_key.format(arch=arch)
 
 
+class VL2:
+    @staticmethod
     def to_gguf_name(name: str) -> str:
         og = name
         name = name.replace("text_model", "t").replace("vision_model", "v")
@@ -31,8 +35,8 @@ def to_gguf_name(name: str) -> str:
         print(f"[to_gguf_name] {og} --> {name}")
         return name
 
-
-    def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
+    @classmethod
+    def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]:
         vision_model = qwen2vl.visual
         tensor_map = {}
         for name, ten in vision_model.state_dict().items():
@@ -47,9 +51,9 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                 wq = ten[:c]
                 wk = ten[c: c * 2]
                 wv = ten[c * 2:]
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
-                tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
             elif 'merger' in name:
                 if name.endswith("ln_q.weight"):
                     tensor_map['v.post_ln.weight'] = ten
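
Note: the qkv branch above splits the fused attention projection into equal thirds along dim 0. A minimal standalone sketch of that slicing, with a hypothetical width not taken from the diff:

    import torch

    c = 1280                            # hypothetical per-projection width
    qkv = torch.randn(3 * c, c)         # fused [q; k; v] weight, stacked on dim 0
    wq, wk, wv = qkv[:c], qkv[c:c * 2], qkv[c * 2:]
    assert wq.shape == wk.shape == wv.shape == (c, c)
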
@@ -57,7 +61,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                     tensor_map['v.post_ln.bias'] = ten
                 else:
                     # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
-                    tensor_map[to_gguf_name(name)] = ten
+                    tensor_map[cls.to_gguf_name(name)] = ten
             elif 'patch_embed.proj.weight' in name:
                 # NOTE: split Conv3D into Conv2Ds
                 c1, c2, kt, kh, kw = ten.shape
@@ -65,7 +69,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
                 tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
                 tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
             else:
-                tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
+                tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten
 
         for new_name, ten in tensor_map.items():
             if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
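
Note: the patch_embed branch above unstacks a Conv3D weight of shape (out_c, in_c, kt, kh, kw) into two Conv2D weights, one per temporal tap, presumably because the GGUF CLIP consumer works with 2-D convolutions. A sketch with hypothetical shapes:

    import torch

    ten = torch.randn(1280, 3, 2, 14, 14)  # (out_c, in_c, kt, kh, kw), kt == 2
    c1, c2, kt, kh, kw = ten.shape
    slice0 = ten[:, :, 0, ...]              # -> "v.patch_embd.weight"
    slice1 = ten[:, :, 1, ...]              # -> "v.patch_embd.weight.1"
    assert slice0.shape == slice1.shape == (c1, c2, kh, kw)
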
@@ -76,6 +80,22 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
         return tensor_map
 
 
+class VL25(VL2):
+
+    @staticmethod
+    def to_gguf_name(name: str) -> str:
+        og = name
+        name = name.replace("text_model", "t").replace("vision_model", "v")
+        name = name.replace("blocks", "blk").replace("embeddings.", "")
+        name = name.replace("attn.", "attn_")
+        name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up")
+        name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.")
+        name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+        name = name.replace("merger.mlp", 'mm')
+        print(f"[vl25][to_gguf_name] {og} --> {name}")
+        return name
+
+
 def main(args):
     if args.data_type == 'fp32':
         dtype = torch.float32
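
Note: VL25 only overrides the name mapping; tensor collection is inherited from VL2.find_vision_tensors. Tracing a hypothetical Qwen2.5-VL tensor name through the chain of replacements above:

    # hand-traced, hypothetical input name
    name = "vision_model.blocks.0.mlp.gate_proj.weight"
    # vision_model -> v        : "v.blocks.0.mlp.gate_proj.weight"
    # blocks -> blk            : "v.blk.0.mlp.gate_proj.weight"
    # mlp.gate_proj -> ffn_gate: "v.blk.0.ffn_gate.weight"
    assert VL25.to_gguf_name(name) == "v.blk.0.ffn_gate.weight"
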
@@ -92,11 +112,18 @@ def main(args):
     model_path = ""
     model_name = args.model_name
     print("model_name: ", model_name)
+    if args.model_type == "qwen2vl":
         qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
             model_name, torch_dtype=dtype, device_map="cpu"
         )
         cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
         vcfg = cfg.vision_config
+    else:
+        qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype=dtype, device_map="cpu"
+        )
+        cfg: Qwen2_5_VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
+        vcfg = cfg.vision_config
 
     if os.path.isdir(model_name):
         local_model = True
@@ -125,14 +152,26 @@ def main(args):
     else:
         raise ValueError()
 
-    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
+    if args.model_type == "qwen2.5vl":
+        fout.add_bool("clip.use_glu_mlp", True) # gate linear unit MLP layer in vision model
+        fout.add_bool("clip.use_rms_norm", True)
+        fout.add_array("clip.vision.fullatt_block_indexes", vcfg.fullatt_block_indexes)
+        fout.add_uint32("clip.vision.window_size", vcfg.window_size)
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size)
+    else:
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
+        fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+
+    if args.model_type == "qwen2.5vl":
+        tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype)
+    else:
+        tensor_map = VL2.find_vision_tensors(qwen2vl, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)
 
     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
     fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
-    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
@@ -160,6 +199,7 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
+    parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl")
     parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
     main(args)
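
With the new --model_type flag, converting a Qwen2.5-VL checkpoint would look roughly like the line below; the script's filename is not shown in this view, so the path and model name here are assumptions:

    python qwen2_vl_surgery.py Qwen/Qwen2.5-VL-3B-Instruct --model_type qwen2.5vl --data_type fp16
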
koboldcpp.py (29 changed lines):
@@ -2018,8 +2018,8 @@ def transform_genparams(genparams, api_format):
             #if auto mode, determine whether a tool is needed
             tools_string = json.dumps(tools_array, indent=0)
             should_use_tools = True
-            user_start = adapter_obj.get("user_start", "### Instruction:\n\n")
-            user_end = adapter_obj.get("user_end", "\n\n### Response:\n\n")
+            user_start = user_message_start
+            user_end = assistant_message_start
             if chosen_tool=="auto":
                 temp_poll = {
                     "prompt": f"{user_start}User query:\n\n{messages_string}\n\nTool Code:\n{tools_string}Determine from the provided tool code if the user query would be best answered by a listed tool (One word: yes / no):{user_end}",
@@ -2030,7 +2030,7 @@ def transform_genparams(genparams, api_format):
                     "ban_eos_token":False
                 }
                 temp_poll_result = generate(genparams=temp_poll)
-                if temp_poll_result and not "yes" in temp_poll_result['text'].lower():
+                if temp_poll_result and "yes" not in temp_poll_result['text'].lower():
                     should_use_tools = False
                     if not args.quiet:
                         print(f"\nRelevant tool is listed: {temp_poll_result['text']} ({should_use_tools})")
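
Note: `not "yes" in x` already parses as `not ("yes" in x)`, so the rewrite above changes style, not behavior; `"yes" not in x` is simply the idiomatic form:

    s = "yes, use the tool"
    assert (not "yes" in s) == ("yes" not in s)  # equivalent; both False here
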
@@ -2301,6 +2301,10 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
     async def handle_sse_stream(self, genparams, api_format):
         global friendlymodelname, currfinishreason
+        # if tools, do not send anything - OAI tool calls will be handled with fakestreaming!
+        using_openai_tools = genparams.get('using_openai_tools', False)
+        if api_format == 4 and using_openai_tools:
+            return
         self.send_response(200)
         self.send_header("X-Accel-Buffering", "no")
         self.send_header("cache-control", "no-cache")
@@ -2311,6 +2315,7 @@ class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
         await asyncio.sleep(0.35) #anti race condition, prevent check from overtaking generate
+
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
@@ -3188,6 +3193,24 @@ Enter Prompt:<br>
                         self.send_header('content-length', str(len(genresp)))
                         self.end_headers(content_type='application/json')
                         self.wfile.write(genresp)
+                    elif api_format == 4 and genparams.get('using_openai_tools', False): #special case, fake streaming for openai tool calls
+                        self.send_response(200)
+                        self.send_header("X-Accel-Buffering", "no")
+                        self.send_header("cache-control", "no-cache")
+                        self.send_header("connection", "keep-alive")
+                        self.end_headers(content_type='text/event-stream')
+                        toolsdata_res = []
+                        try:
+                            toolsdata_res = gen['choices'][0]['message']['tool_calls']
+                        except Exception:
+                            toolsdata_res = []
+                        toolsdata_p1 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":None,"delta":{'role':'assistant','content':None, "tool_calls":toolsdata_res}}]})
+                        toolsdata_p2 = json.dumps({"id":"koboldcpp","object":"chat.completion.chunk","created":int(time.time()),"model":friendlymodelname,"choices":[{"index":0,"finish_reason":"tool_calls","delta":{}}]})
+                        self.wfile.write(f'data: {toolsdata_p1}\n\n'.encode())
+                        self.wfile.write(f'data: {toolsdata_p2}\n\n'.encode())
+                        self.wfile.write('data: [DONE]'.encode())
+                        self.wfile.flush()
+                        self.close_connection = True
                 except Exception as ex:
                     utfprint(ex,1)
                     print("Generate: The response could not be sent, maybe connection was terminated?")
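
For context, this fake-streaming branch sends the whole tool call as a single chunk, then a closing chunk and [DONE], instead of token-by-token deltas. A sketch of what an OpenAI-style streaming client would see on the wire (timestamp, model name, and tool call contents are hypothetical):

    data: {"id": "koboldcpp", "object": "chat.completion.chunk", "created": 1718000000, "model": "mymodel", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": null, "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}]}}]}

    data: {"id": "koboldcpp", "object": "chat.completion.chunk", "created": 1718000000, "model": "mymodel", "choices": [{"index": 0, "finish_reason": "tool_calls", "delta": {}}]}

    data: [DONE]
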