mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-28 11:40:43 +00:00
* chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs * Fix ty errors. * Fix flake8 err
991 lines
32 KiB
Python
Executable file
991 lines
32 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Test parallel tool-calling capability via chat completions endpoint.
|
|
|
|
Only run this against models that actually support parallel tool calls — this
|
|
script does not attempt to toggle that setting on the server. Each scenario is
|
|
explicitly worded so that a capable model SHOULD emit multiple tool calls in a
|
|
single assistant turn (either the same tool N times, or several different
|
|
tools at once).
|
|
|
|
Each test case contains:
|
|
- tools: list of tool definitions (OpenAI-compatible)
|
|
- messages: initial conversation messages
|
|
- mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
|
|
- expected_parallel: dict describing what constitutes a successful parallel turn
|
|
{"min_parallel": int, # minimum tool_calls in one turn
|
|
"require_same_tool": Optional[str], # all parallel calls must be this tool
|
|
"require_distinct_tools": Optional[int], # >= N distinct tool names in one turn
|
|
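   "min_distinct_args_key": Optional[str],     # parallel calls must span several
                                               # distinct values of this arg key
   "min_distinct_args_count": Optional[int]}   # how many distinct values are needed
                                               # (defaults to min_parallel)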
|
|
- validate: callable(turns, all_tool_calls, final_content) -> (passed, reason)
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import requests
|
|
import sys
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Color / formatting helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
RESET = "\x1b[0m"
|
|
BOLD = "\x1b[1m"
|
|
DIM = "\x1b[2m"
|
|
CYAN = "\x1b[36m"
|
|
YELLOW = "\x1b[33m"
|
|
GREEN = "\x1b[32m"
|
|
RED = "\x1b[31m"
|
|
BLUE = "\x1b[34m"
|
|
WHITE = "\x1b[97m"
|
|
MAGENTA = "\x1b[35m"
|
|
|
|
|
|
def _print(text="", end="\n"):
|
|
sys.stdout.write(text + end)
|
|
sys.stdout.flush()
|
|
|
|
|
|
def print_header(title):
    """Print ``title`` inside a cyan box-drawing frame.

    The frame is 60 columns wide and widens when the title needs more room,
    so the right-hand border always lines up with the corners, including for
    long test names.
    """
    # One leading space plus at least one trailing space around the title.
    inner_width = max(60, len(title) + 2)
    bar = "─" * inner_width
    padding = " " * (inner_width - 1 - len(title))
    _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}")
    _print(f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{padding}│{RESET}")
    _print(f"{BOLD}{CYAN}└{bar}┘{RESET}")
|
|
|
|
|
|
def print_turn_banner(turn_idx, n_calls):
    """Print a one-line banner for a turn, highlighted when it has 2+ tool calls."""
    if n_calls >= 2:
        color = MAGENTA
    else:
        color = DIM
    banner = f"\n {BOLD}{color}▶ turn {turn_idx} — {n_calls} tool call(s){RESET}"
    _print(banner)
|
|
|
|
|
|
def print_tool_call(name, args):
    """Print a tool invocation as ``name(<args serialized as JSON>)``."""
    rendered_args = json.dumps(args)
    line = f" {BOLD}{YELLOW}⚙ {name}{RESET}{DIM}({rendered_args}){RESET}"
    _print(line)
|
|
|
|
|
|
def print_tool_result(result):
    """Print a tool result, truncated to 140 characters with an ellipsis."""
    limit = 140
    preview = result[:limit]
    if len(result) > limit:
        preview = preview + "…"
    _print(f" {DIM}{BLUE}↳ {preview}{RESET}")
|
|
|
|
|
|
def print_model_output(text):
    """Write raw model text to stdout without a trailing newline, then flush."""
    stream = sys.stdout
    stream.write(text)
    stream.flush()
|
|
|
|
|
|
def print_pass(reason):
    """Print a green PASS marker followed by ``reason``."""
    marker = f"\n{BOLD}{GREEN}✔ PASS{RESET}"
    _print(f"{marker} {reason}")
|
|
|
|
|
|
def print_fail(reason):
    """Print a red FAIL marker followed by ``reason``."""
    marker = f"\n{BOLD}{RED}✘ FAIL{RESET}"
    _print(f"{marker} {reason}")
|
|
|
|
|
|
def print_info(msg):
    """Print an informational message in dim text."""
    line = f"{DIM}{msg}{RESET}"
    _print(line)
|
|
|
|
|
|
def print_warn(msg):
    """Print a warning message in bold yellow, prefixed with a warning sign."""
    line = f"{BOLD}{YELLOW}⚠ {msg}{RESET}"
    _print(line)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _merge_tool_call_delta(tool_calls, tc):
    """Fold one streamed tool-call fragment into ``tool_calls`` in place.

    Fragments are addressed by their ``index``; missing slots are created on
    demand. Absent or null ``id`` / ``name`` / ``arguments`` fields are
    treated as empty strings so they never break the concatenation.
    """
    idx = tc.get("index") or 0
    while len(tool_calls) <= idx:
        tool_calls.append(
            {
                "id": "",
                "type": "function",
                "function": {"name": "", "arguments": ""},
            }
        )
    slot = tool_calls[idx]
    slot["id"] += tc.get("id") or ""
    fn = tc.get("function") or {}
    slot["function"]["name"] += fn.get("name") or ""
    slot["function"]["arguments"] += fn.get("arguments") or ""


def chat_completion(url, messages, tools=None, stream=False):
    """POST a chat-completions request and collect the assistant reply.

    Args:
        url: Full URL of the chat completions endpoint.
        messages: Conversation so far, as a list of message dicts.
        tools: Optional tool definitions; when truthy they are sent together
            with ``tool_choice="auto"``.
        stream: When true, request a streamed response and assemble the
            ``data:`` chunks; content is echoed to stdout as it arrives.

    Returns:
        ``None`` when the request fails, the stream is interrupted, or a
        non-stream body is not valid JSON (a failure line is printed).
        Otherwise a dict with ``"content"`` and ``"tool_calls"``, plus
        ``"reasoning_content"`` when the model produced any.
    """
    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 4096,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"

    try:
        response = requests.post(url, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: list[dict] = []

    if stream:
        try:
            for line in response.iter_lines():
                if not line:
                    continue
                decoded = line.decode("utf-8")
                # Accept both "data: {...}" and "data:{...}".
                if not decoded.startswith("data:"):
                    continue
                data_str = decoded[len("data:"):].strip()
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    continue
                if not isinstance(data, dict):
                    continue
                choices = data.get("choices") or []
                if not choices:
                    continue
                # Fields may be present but null; normalise before use.
                delta = choices[0].get("delta") or {}
                if delta.get("reasoning_content"):
                    reasoning_content += delta["reasoning_content"]
                if delta.get("content"):
                    full_content += delta["content"]
                    print_model_output(delta["content"])
                for tc in delta.get("tool_calls") or []:
                    _merge_tool_call_delta(tool_calls, tc)
        except requests.exceptions.RequestException as e:
            print_fail(f"Stream interrupted: {e}")
            return None
        finally:
            # Breaking out on [DONE] would otherwise leave the connection open.
            response.close()
    else:
        try:
            data = response.json()
        except ValueError as e:
            print_fail(f"Invalid JSON in response body: {e}")
            return None
        choices = data.get("choices") or []
        if choices:
            msg = choices[0].get("message") or {}
            full_content = msg.get("content") or ""
            reasoning_content = msg.get("reasoning_content") or ""
            tool_calls = msg.get("tool_calls") or []
            if full_content:
                print_model_output(full_content)

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result
|
|
|
|
|
|
def _parse_tool_arguments(raw):
    """Decode a tool call's ``arguments`` payload into a dict.

    Returns ``{}`` when the payload is missing, is not valid JSON, or decodes
    to something other than a JSON object, so mock tools can always call
    ``.get`` on the result.
    """
    if isinstance(raw, dict):
        return raw
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
    """
    Drive the multi-turn tool-call loop, recording each turn's tool calls
    separately so parallelism can be validated.

    Every tool call is answered by the matching callable in
    ``mock_tool_responses`` (or an "Unknown tool" error payload) and the
    result is appended to the conversation as a ``tool`` message.

    Returns (turns, all_tool_calls, final_content) where `turns` is a list
    of dicts: {"index": int, "tool_calls": [...], "content": str}.
    `final_content` is None when a request failed or when `max_turns` was
    reached without a turn free of tool calls.
    """
    msgs = list(messages)
    turns: list[dict] = []
    all_tool_calls: list[dict] = []

    for turn_idx in range(max_turns):
        result = chat_completion(url, msgs, tools=tools, stream=stream)
        if result is None:
            return turns, all_tool_calls, None

        tcs = result.get("tool_calls") or []
        content = result.get("content") or ""

        turns.append(
            {"index": turn_idx, "tool_calls": list(tcs), "content": content}
        )

        # A turn without tool calls is the model's final answer.
        if not tcs:
            if content:
                _print(f"\n{DIM}{'·' * 60}{RESET}")
                _print(f"{DIM} model response:{RESET}\n")
            return turns, all_tool_calls, content

        print_turn_banner(turn_idx, len(tcs))
        all_tool_calls.extend(tcs)

        assistant_msg: dict = {
            "role": "assistant",
            "content": content,
            "tool_calls": tcs,
        }
        reasoning = result.get("reasoning_content")
        if reasoning:
            assistant_msg["reasoning_content"] = reasoning
        msgs.append(assistant_msg)

        for tc in tcs:
            tool_name = tc["function"]["name"]
            # Malformed arguments degrade to {} instead of aborting the run.
            args = _parse_tool_arguments(tc["function"].get("arguments"))

            print_tool_call(tool_name, args)

            mock_fn = mock_tool_responses.get(tool_name)
            if mock_fn:
                tool_result = mock_fn(args)
            else:
                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})

            print_tool_result(tool_result)

            msgs.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.get("id", ""),
                    "content": tool_result,
                }
            )

    return turns, all_tool_calls, None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parallelism helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _best_parallel_turn(turns):
|
|
"""Return the turn (dict) with the most tool calls, or None if no tools."""
|
|
tool_turns = [t for t in turns if t["tool_calls"]]
|
|
if not tool_turns:
|
|
return None
|
|
return max(tool_turns, key=lambda t: len(t["tool_calls"]))
|
|
|
|
|
|
def _distinct_tool_names(turn):
|
|
return {tc["function"]["name"] for tc in turn["tool_calls"]}
|
|
|
|
|
|
def _distinct_arg_values(turn, key):
|
|
values = set()
|
|
for tc in turn["tool_calls"]:
|
|
try:
|
|
args = json.loads(tc["function"]["arguments"])
|
|
except json.JSONDecodeError:
|
|
continue
|
|
v = args.get(key)
|
|
if v is not None:
|
|
if isinstance(v, str):
|
|
values.add(v.strip().lower())
|
|
else:
|
|
values.add(v)
|
|
return values
|
|
|
|
|
|
def _check_parallel(turns, expected):
|
|
"""
|
|
Check that at least one turn satisfies the parallel-call expectations.
|
|
Returns (ok, reason).
|
|
"""
|
|
best = _best_parallel_turn(turns)
|
|
if best is None:
|
|
return False, "No tool calls were made at all"
|
|
|
|
min_parallel = expected.get("min_parallel", 2)
|
|
if len(best["tool_calls"]) < min_parallel:
|
|
by_turn = [len(t["tool_calls"]) for t in turns]
|
|
return False, (
|
|
f"No turn had >= {min_parallel} parallel tool calls "
|
|
f"(per-turn counts: {by_turn})"
|
|
)
|
|
|
|
require_same = expected.get("require_same_tool")
|
|
if require_same is not None:
|
|
names = [tc["function"]["name"] for tc in best["tool_calls"]]
|
|
if any(n != require_same for n in names):
|
|
return False, (
|
|
f"Parallel turn mixed tools; expected all {require_same!r}, got {names}"
|
|
)
|
|
|
|
require_distinct = expected.get("require_distinct_tools")
|
|
if require_distinct is not None:
|
|
distinct = _distinct_tool_names(best)
|
|
if len(distinct) < require_distinct:
|
|
return False, (
|
|
f"Parallel turn had only {len(distinct)} distinct tool names "
|
|
f"({distinct}); need >= {require_distinct}"
|
|
)
|
|
|
|
distinct_key = expected.get("min_distinct_args_key")
|
|
distinct_count = expected.get("min_distinct_args_count", min_parallel)
|
|
if distinct_key is not None:
|
|
values = _distinct_arg_values(best, distinct_key)
|
|
if len(values) < distinct_count:
|
|
return False, (
|
|
f"Parallel turn had only {len(values)} distinct {distinct_key!r} "
|
|
f"values ({values}); need >= {distinct_count}"
|
|
)
|
|
|
|
return True, (
|
|
f"Parallel turn had {len(best['tool_calls'])} calls across "
|
|
f"{len(_distinct_tool_names(best))} distinct tool(s)"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test case runner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def run_test(url, test_case, stream):
    """Run one test case in the given mode and report PASS/FAIL.

    The case fails when the server gave no response, when no turn meets the
    case's ``expected_parallel`` requirements, or when the case's own
    ``validate`` callable rejects the outcome. Returns the pass/fail result.
    """
    name = test_case["name"]
    mode = "stream" if stream else "non-stream"
    print_header(f"{name} [{mode}]")

    turns, all_tool_calls, final_content = run_agentic_loop(
        url,
        messages=test_case["messages"],
        tools=test_case["tools"],
        mock_tool_responses=test_case["mock_tool_responses"],
        stream=stream,
    )
    if not turns:
        print_fail("No response from server.")
        return False

    parallel_ok, parallel_reason = _check_parallel(
        turns, test_case["expected_parallel"]
    )
    if not parallel_ok:
        print_fail(parallel_reason)
        return False

    validator = test_case["validate"]
    passed, reason = validator(turns, all_tool_calls, final_content)
    if not passed:
        print_fail(reason)
    else:
        print_pass(f"{parallel_reason}; {reason}")
    return passed
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test case definitions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ----
|
|
|
|
_FILE_TOOLS = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "read_file",
|
|
"description": (
|
|
"Read the full contents of a file from the local filesystem. "
|
|
"Call this tool in parallel when asked to read several files — "
|
|
"each path needs its own call."
|
|
),
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"path": {
|
|
"type": "string",
|
|
"description": "Absolute or repo-relative path to a file",
|
|
},
|
|
},
|
|
"required": ["path"],
|
|
},
|
|
},
|
|
},
|
|
]
|
|
|
|
_FILE_CONTENTS = {
|
|
"config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n",
|
|
"config/redis.yml": "host: cache.internal\nport: 6379\ndb: 0\n",
|
|
"config/queue.yml": "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n",
|
|
"config/auth.yml": "provider: oidc\nissuer: https://auth.internal\n",
|
|
}
|
|
|
|
|
|
def _read_file_mock(args):
|
|
path = args.get("path", "")
|
|
norm = path.lstrip("./").lstrip("/")
|
|
content = _FILE_CONTENTS.get(norm)
|
|
if content is None:
|
|
for k, v in _FILE_CONTENTS.items():
|
|
if path.endswith(k):
|
|
content = v
|
|
break
|
|
if content is None:
|
|
return json.dumps({"path": path, "error": "not found"})
|
|
return json.dumps({"path": path, "content": content})
|
|
|
|
|
|
MULTIFILE_READ_TEST = {
|
|
"name": "Parallel multi-file read (same tool, 4 distinct paths)",
|
|
"tools": _FILE_TOOLS,
|
|
"messages": [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"Please read all four of these config files so I can review them "
|
|
"together: config/database.yml, config/redis.yml, config/queue.yml, "
|
|
"and config/auth.yml. Call read_file for every path in parallel in "
|
|
"a single batch — do NOT read them one by one sequentially across "
|
|
"turns. After you have all four, give me a one-line summary of each."
|
|
),
|
|
}
|
|
],
|
|
"mock_tool_responses": {"read_file": _read_file_mock},
|
|
"expected_parallel": {
|
|
"min_parallel": 4,
|
|
"require_same_tool": "read_file",
|
|
"min_distinct_args_key": "path",
|
|
"min_distinct_args_count": 4,
|
|
},
|
|
"validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content),
|
|
}
|
|
|
|
|
|
def _validate_multifile(turns, tcs, content):
|
|
del turns
|
|
if not content:
|
|
return False, "No final summary produced"
|
|
return True, f"{len(tcs)} total read_file calls; content length={len(content)}"
|
|
|
|
|
|
# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ----
|
|
|
|
_TODO_TOOLS = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "mark_todo_complete",
|
|
"description": (
|
|
"Mark a single TODO item as complete by ID. When the user wants "
|
|
"several items marked at once, call this tool in parallel — "
|
|
"one call per item — rather than sequentially across turns."
|
|
),
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"todo_id": {
|
|
"type": "string",
|
|
"description": "Identifier of the TODO item",
|
|
},
|
|
"note": {
|
|
"type": "string",
|
|
"description": "Optional completion note",
|
|
},
|
|
},
|
|
"required": ["todo_id"],
|
|
},
|
|
},
|
|
},
|
|
]
|
|
|
|
_TODO_DB = {
|
|
"T-101": "Draft onboarding doc",
|
|
"T-102": "Update dependency lockfile",
|
|
"T-103": "Fix flaky login test",
|
|
"T-104": "Rotate service credentials",
|
|
"T-105": "Archive Q4 reports",
|
|
}
|
|
|
|
|
|
def _mark_todo_mock(args):
|
|
tid = args.get("todo_id", "")
|
|
if tid in _TODO_DB:
|
|
return json.dumps({"todo_id": tid, "title": _TODO_DB[tid], "status": "done"})
|
|
return json.dumps({"todo_id": tid, "error": "unknown id"})
|
|
|
|
|
|
TODO_BATCH_TEST = {
|
|
"name": "Batch TODO completion (same tool, 5 IDs in one turn)",
|
|
"tools": _TODO_TOOLS,
|
|
"messages": [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"I finished every item on today's list. Please mark all of the "
|
|
"following TODOs as complete, in one parallel batch: T-101, T-102, "
|
|
"T-103, T-104, T-105. Don't mark them one at a time across separate "
|
|
"turns — issue all five mark_todo_complete calls at once. Afterwards "
|
|
"confirm which ones succeeded."
|
|
),
|
|
}
|
|
],
|
|
"mock_tool_responses": {"mark_todo_complete": _mark_todo_mock},
|
|
"expected_parallel": {
|
|
"min_parallel": 5,
|
|
"require_same_tool": "mark_todo_complete",
|
|
"min_distinct_args_key": "todo_id",
|
|
"min_distinct_args_count": 5,
|
|
},
|
|
"validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content),
|
|
}
|
|
|
|
|
|
def _validate_todo(turns, tcs, content):
|
|
del turns
|
|
if not content:
|
|
return False, "No confirmation summary produced"
|
|
return True, f"{len(tcs)} total mark_todo_complete calls"
|
|
|
|
|
|
# ---- Test 3: Multi-city weather (same tool, N parallel locations) ----
|
|
|
|
_WEATHER_TOOLS = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "get_weather",
|
|
"description": (
|
|
"Fetch current weather for ONE city. When the user asks about "
|
|
"several cities, call this tool in parallel — one call per city — "
|
|
"instead of sequentially."
|
|
),
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"city": {"type": "string", "description": "City name"},
|
|
"units": {
|
|
"type": "string",
|
|
"enum": ["metric", "imperial"],
|
|
"default": "metric",
|
|
},
|
|
},
|
|
"required": ["city"],
|
|
},
|
|
},
|
|
},
|
|
]
|
|
|
|
_WEATHER_DB = {
|
|
"tokyo": {"city": "Tokyo", "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64},
|
|
"london": {"city": "London", "temp_c": 9.1, "condition": "overcast", "humidity": 81},
|
|
"new york": {"city": "New York", "temp_c": 12.7, "condition": "clear", "humidity": 55},
|
|
"paris": {"city": "Paris", "temp_c": 11.3, "condition": "light rain", "humidity": 78},
|
|
}
|
|
|
|
|
|
def _weather_mock(args):
|
|
city = args.get("city", "").strip().lower()
|
|
if city.startswith("new york"):
|
|
city = "new york"
|
|
if city in _WEATHER_DB:
|
|
return json.dumps(_WEATHER_DB[city])
|
|
return json.dumps({"city": args.get("city", ""), "error": "unknown city"})
|
|
|
|
|
|
MULTI_WEATHER_TEST = {
|
|
"name": "Parallel multi-city weather (same tool, 4 cities)",
|
|
"tools": _WEATHER_TOOLS,
|
|
"messages": [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"I'm comparing today's weather across four cities for a travel "
|
|
"decision: Tokyo, London, New York, and Paris. Please call "
|
|
"get_weather for all four in parallel in a single turn — don't "
|
|
"fetch them one at a time. Then rank them from warmest to coolest."
|
|
),
|
|
}
|
|
],
|
|
"mock_tool_responses": {"get_weather": _weather_mock},
|
|
"expected_parallel": {
|
|
"min_parallel": 4,
|
|
"require_same_tool": "get_weather",
|
|
"min_distinct_args_key": "city",
|
|
"min_distinct_args_count": 4,
|
|
},
|
|
"validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content),
|
|
}
|
|
|
|
|
|
def _validate_weather(turns, tcs, content):
|
|
del turns
|
|
if not content or not any(
|
|
kw in content.lower() for kw in ("warmest", "rank", "hot", "cool")
|
|
):
|
|
return False, f"Final content missing a ranking: {content!r}"
|
|
return True, f"{len(tcs)} total get_weather calls; ranking produced"
|
|
|
|
|
|
# ---- Test 4: Trip planning (different tools, parallel in one turn) ----
|
|
|
|
_TRIP_TOOLS = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "search_flights",
|
|
"description": "Search one-way flights between two airports on a given date.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"from_airport": {"type": "string", "description": "IATA code, e.g. SFO"},
|
|
"to_airport": {"type": "string", "description": "IATA code, e.g. JFK"},
|
|
"date": {"type": "string", "description": "YYYY-MM-DD"},
|
|
},
|
|
"required": ["from_airport", "to_airport", "date"],
|
|
},
|
|
},
|
|
},
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "search_hotels",
|
|
"description": "Search hotels in a city for a date range.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"city": {"type": "string"},
|
|
"check_in": {"type": "string", "description": "YYYY-MM-DD"},
|
|
"check_out": {"type": "string", "description": "YYYY-MM-DD"},
|
|
"max_price": {"type": "integer"},
|
|
},
|
|
"required": ["city", "check_in", "check_out"],
|
|
},
|
|
},
|
|
},
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "search_restaurants",
|
|
"description": "Search restaurants in a city by cuisine.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"city": {"type": "string"},
|
|
"cuisine": {"type": "string"},
|
|
},
|
|
"required": ["city"],
|
|
},
|
|
},
|
|
},
|
|
]
|
|
|
|
_FLIGHTS_RESULT = {
|
|
"results": [
|
|
{"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389},
|
|
{"flight": "AA 20", "depart": "10:00", "arrive": "18:35", "price": 412},
|
|
]
|
|
}
|
|
_HOTELS_RESULT = {
|
|
"results": [
|
|
{"name": "Midtown Grand", "nightly_rate": 245, "rating": 4.3},
|
|
{"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6},
|
|
]
|
|
}
|
|
_RESTAURANTS_RESULT = {
|
|
"results": [
|
|
{"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5},
|
|
{"name": "Osteria Blu", "cuisine": "italian", "rating": 4.4},
|
|
]
|
|
}
|
|
|
|
TRIP_PLAN_TEST = {
|
|
"name": "Trip planning (3 different tools in parallel)",
|
|
"tools": _TRIP_TOOLS,
|
|
"messages": [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"I'm flying from SFO to JFK on 2026-06-12 and staying four nights "
|
|
"(check out 2026-06-16). I'd also like some Italian restaurant "
|
|
"suggestions in New York. Please call search_flights, search_hotels, "
|
|
"and search_restaurants in parallel — all three in a single turn, "
|
|
"since they don't depend on each other. Then give me a concise "
|
|
"travel summary."
|
|
),
|
|
}
|
|
],
|
|
"mock_tool_responses": {
|
|
"search_flights": lambda _: json.dumps(_FLIGHTS_RESULT),
|
|
"search_hotels": lambda _: json.dumps(_HOTELS_RESULT),
|
|
"search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT),
|
|
},
|
|
"expected_parallel": {
|
|
"min_parallel": 3,
|
|
"require_distinct_tools": 3,
|
|
},
|
|
"validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content),
|
|
}
|
|
|
|
|
|
def _validate_trip(turns, tcs, content):
|
|
del turns
|
|
names = {tc["function"]["name"] for tc in tcs}
|
|
required = {"search_flights", "search_hotels", "search_restaurants"}
|
|
missing = required - names
|
|
if missing:
|
|
return False, f"Missing tool calls: {missing}"
|
|
if not content:
|
|
return False, "No travel summary produced"
|
|
return True, f"All three tools called; summary length={len(content)}"
|
|
|
|
|
|
# ---- Test 5: Portfolio check (same tool, parallel tickers) ----
|
|
|
|
_STOCK_TOOLS = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "get_stock_quote",
|
|
"description": (
|
|
"Get the latest quote for ONE ticker. When the user asks about "
|
|
"multiple tickers, call this tool in parallel — one per symbol — "
|
|
"rather than sequentially."
|
|
),
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"symbol": {"type": "string", "description": "Ticker symbol"},
|
|
},
|
|
"required": ["symbol"],
|
|
},
|
|
},
|
|
},
|
|
]
|
|
|
|
_STOCK_DB = {
|
|
"AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"},
|
|
"MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"},
|
|
"GOOGL":{"symbol": "GOOGL","price": 175.22, "change_pct": "-0.3%"},
|
|
"AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"},
|
|
"NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"},
|
|
}
|
|
|
|
|
|
def _stock_mock(args):
|
|
sym = args.get("symbol", "").strip().upper()
|
|
if sym in _STOCK_DB:
|
|
return json.dumps(_STOCK_DB[sym])
|
|
return json.dumps({"symbol": sym, "error": "unknown ticker"})
|
|
|
|
|
|
PORTFOLIO_TEST = {
|
|
"name": "Portfolio check (same tool, 5 tickers in parallel)",
|
|
"tools": _STOCK_TOOLS,
|
|
"messages": [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"Pull the latest quote for every ticker in my portfolio — AAPL, "
|
|
"MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These "
|
|
"lookups are independent, so please don't chain them across turns. "
|
|
"Once you have all five, tell me which ticker had the biggest "
|
|
"percentage change today."
|
|
),
|
|
}
|
|
],
|
|
"mock_tool_responses": {"get_stock_quote": _stock_mock},
|
|
"expected_parallel": {
|
|
"min_parallel": 5,
|
|
"require_same_tool": "get_stock_quote",
|
|
"min_distinct_args_key": "symbol",
|
|
"min_distinct_args_count": 5,
|
|
},
|
|
"validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content),
|
|
}
|
|
|
|
|
|
def _validate_portfolio(turns, tcs, content):
|
|
del turns
|
|
if not content or ("nvda" not in content.lower() and "NVDA" not in content):
|
|
return False, f"Expected NVDA to be identified as the biggest mover: {content!r}"
|
|
return True, f"{len(tcs)} total quotes pulled"
|
|
|
|
|
|
# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ----
|
|
|
|
_LANG_TOOLS = [
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "translate_text",
|
|
"description": "Translate a short text into a target language.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"text": {"type": "string"},
|
|
"target_language": {"type": "string",
|
|
"description": "ISO 639-1 language code, e.g. 'es'"},
|
|
},
|
|
"required": ["text", "target_language"],
|
|
},
|
|
},
|
|
},
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "get_definition",
|
|
"description": "Get the English dictionary definition of a word.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"word": {"type": "string"},
|
|
},
|
|
"required": ["word"],
|
|
},
|
|
},
|
|
},
|
|
{
|
|
"type": "function",
|
|
"function": {
|
|
"name": "get_synonyms",
|
|
"description": "Get English synonyms for a word.",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"word": {"type": "string"},
|
|
},
|
|
"required": ["word"],
|
|
},
|
|
},
|
|
},
|
|
]
|
|
|
|
|
|
def _translate_mock(args):
|
|
t = args.get("text", "")
|
|
lang = args.get("target_language", "")
|
|
return json.dumps({"source": t, "target_language": lang, "translation": f"[{lang}] {t}"})
|
|
|
|
|
|
def _definition_mock(args):
|
|
w = args.get("word", "")
|
|
return json.dumps({
|
|
"word": w,
|
|
"definition": f"A standard dictionary definition of {w!r}.",
|
|
})
|
|
|
|
|
|
def _synonyms_mock(args):
|
|
w = args.get("word", "")
|
|
return json.dumps({
|
|
"word": w,
|
|
"synonyms": ["synonym_a", "synonym_b", "synonym_c"],
|
|
})
|
|
|
|
|
|
LANG_TOOLKIT_TEST = {
|
|
"name": "Language toolkit (translate + definition + synonyms in parallel)",
|
|
"tools": _LANG_TOOLS,
|
|
"messages": [
|
|
{
|
|
"role": "user",
|
|
"content": (
|
|
"For the English word 'resilient', I need three independent "
|
|
"look-ups at once: (a) translate it into Spanish, (b) fetch its "
|
|
"dictionary definition, and (c) list its synonyms. These three "
|
|
"calls don't depend on each other — please issue them in parallel "
|
|
"in a single turn. Then present the combined results as a short "
|
|
"language note."
|
|
),
|
|
}
|
|
],
|
|
"mock_tool_responses": {
|
|
"translate_text": _translate_mock,
|
|
"get_definition": _definition_mock,
|
|
"get_synonyms": _synonyms_mock,
|
|
},
|
|
"expected_parallel": {
|
|
"min_parallel": 3,
|
|
"require_distinct_tools": 3,
|
|
},
|
|
"validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content),
|
|
}
|
|
|
|
|
|
def _validate_lang(turns, tcs, content):
|
|
del turns
|
|
names = {tc["function"]["name"] for tc in tcs}
|
|
required = {"translate_text", "get_definition", "get_synonyms"}
|
|
missing = required - names
|
|
if missing:
|
|
return False, f"Missing tool calls: {missing}"
|
|
if not content:
|
|
return False, "No language note produced"
|
|
return True, f"All three lookup tools called; note length={len(content)}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# All test cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
ALL_TEST_CASES = [
|
|
MULTIFILE_READ_TEST,
|
|
TODO_BATCH_TEST,
|
|
MULTI_WEATHER_TEST,
|
|
TRIP_PLAN_TEST,
|
|
PORTFOLIO_TEST,
|
|
LANG_TOOLKIT_TEST,
|
|
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main():
    """Parse CLI options, run the selected test cases, and exit with a status.

    Each selected case runs once per enabled mode (non-stream first, then
    stream). The process exits 0 only when every executed test passed, and
    exits 1 when any failed or when ``--test`` matched no case.

    ``--no-stream`` and ``--stream-only`` are mutually exclusive: together
    they would select no mode at all, and the run would report a vacuous
    "0/0 passed" success.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Test llama-server parallel tool-calling capability. Run this only "
            "against models configured for parallel tool calls — this script "
            "does not configure that itself."
        )
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    mode_group.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args()

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")
    print_warn(
        "This script expects the target model to emit multiple tool calls in a "
        "single assistant turn. Run it only against parallel-tool-capable models."
    )

    # False = non-streaming request, True = streaming request.
    modes: list[bool] = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases: list[dict] = ALL_TEST_CASES
    if args.test:
        name_filter = args.test.lower()
        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    color = GREEN if passed == total else RED
    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
    _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
    sys.exit(0 if passed == total else 1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|