# koboldcpp/scripts/server-test-parallel-tc.py

#!/usr/bin/env python3
"""
Test parallel tool-calling capability via chat completions endpoint.

Only run this against models that actually support parallel tool calls — this
script does not attempt to toggle that setting on the server. Each scenario is
explicitly worded so that a capable model SHOULD emit multiple tool calls in a
single assistant turn (either the same tool N times, or several different
tools at once).

Each test case contains:
  - name: human-readable test title (shown in the header, matched by --test)
  - tools: list of tool definitions (OpenAI-compatible)
  - messages: initial conversation messages
  - mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
  - expected_parallel: dict describing what constitutes a successful parallel turn
        {"min_parallel": int,                # minimum tool_calls in one turn
         "require_same_tool": Optional[str], # all parallel calls must be this tool
         "require_distinct_tools": Optional[int], # >= N distinct tool names in one turn
         "min_distinct_args_key": Optional[str],    # argument key whose values must differ
         "min_distinct_args_count": Optional[int]}  # >= N distinct values of that key
                                                     # (defaults to min_parallel)
  - validate: callable(turns, all_tool_calls, final_content) -> (passed, reason)
"""

import argparse
import json
import requests
import sys

# ---------------------------------------------------------------------------
# Color / formatting helpers
# ---------------------------------------------------------------------------

# ANSI SGR escape sequences used to colorize terminal output.
# "\033" is the ESC byte written in octal.
RESET = "\033[0m"     # clear all attributes
BOLD = "\033[1m"
DIM = "\033[2m"
CYAN = "\033[36m"
YELLOW = "\033[33m"
GREEN = "\033[32m"
RED = "\033[31m"
BLUE = "\033[34m"
WHITE = "\033[97m"    # bright white
MAGENTA = "\033[35m"


def _print(text="", end="\n"):
    """Write ``text`` followed by ``end`` to stdout and flush right away."""
    stream = sys.stdout
    stream.write(text + end)
    stream.flush()


def print_header(title):
    """Print ``title`` inside a cyan box-drawing frame.

    The frame is 60 columns wide and grows when the title would not fit, so
    the right-hand border always lines up with the top and bottom bars.
    """
    # A fixed 60-column frame left zero padding for titles longer than 58
    # characters, pushing the closing "│" past the corners. Two leading and at
    # least two trailing spaces are reserved around the title.
    inner_width = max(60, len(title) + 4)
    bar = "─" * inner_width
    padding = " " * (inner_width - 2 - len(title))
    _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}")
    _print(f"{BOLD}{CYAN}│  {WHITE}{title}{CYAN}{padding}│{RESET}")
    _print(f"{BOLD}{CYAN}└{bar}┘{RESET}")


def print_turn_banner(turn_idx, n_calls):
    """Announce a tool-calling turn: magenta for 2+ calls, dim otherwise."""
    if n_calls >= 2:
        color = MAGENTA
    else:
        color = DIM
    banner = f"\n  {BOLD}{color}▶ turn {turn_idx} — {n_calls} tool call(s){RESET}"
    _print(banner)


def print_tool_call(name, args):
    """Print one tool invocation as ``⚙ name(<args serialized as JSON>)``."""
    rendered_args = json.dumps(args)
    line = f"    {BOLD}{YELLOW}⚙ {name}{RESET}{DIM}({rendered_args}){RESET}"
    _print(line)


def print_tool_result(result):
    """Print a tool result, truncated to 140 characters with a trailing "…"."""
    limit = 140
    suffix = "…" if len(result) > limit else ""
    preview = result[:limit] + suffix
    _print(f"      {DIM}{BLUE}↳ {preview}{RESET}")


def print_model_output(text):
    """Echo model text to stdout verbatim (no newline added) and flush."""
    stream = sys.stdout
    stream.write(text)
    stream.flush()


def print_pass(reason):
    """Print a green "✔ PASS" badge followed by ``reason``."""
    badge = f"{BOLD}{GREEN}✔ PASS{RESET}"
    _print(f"\n{badge}  {reason}")


def print_fail(reason):
    """Print a red "✘ FAIL" badge followed by ``reason``."""
    badge = f"{BOLD}{RED}✘ FAIL{RESET}"
    _print(f"\n{badge}  {reason}")


def print_info(msg):
    """Print ``msg`` as a dimmed informational line."""
    line = f"{DIM}{msg}{RESET}"
    _print(line)


def print_warn(msg):
    """Print ``msg`` as a bold yellow warning prefixed with "⚠"."""
    line = f"{BOLD}{YELLOW}⚠ {msg}{RESET}"
    _print(line)


# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------


def _merge_tool_call_delta(tool_calls, fragment):
    """Fold one streamed ``delta.tool_calls`` fragment into ``tool_calls`` in place.

    Fragments are routed by their ``index``; the ``id``, function ``name`` and
    function ``arguments`` pieces are appended to the entry at that index,
    creating empty entries as needed.
    """
    # The ``or`` fallbacks tolerate keys that are present but null, which would
    # otherwise raise TypeError on comparison or string concatenation.
    idx = fragment.get("index") or 0
    while len(tool_calls) <= idx:
        tool_calls.append(
            {
                "id": "",
                "type": "function",
                "function": {"name": "", "arguments": ""},
            }
        )
    slot = tool_calls[idx]
    slot["id"] += fragment.get("id") or ""
    fn = fragment.get("function") or {}
    slot["function"]["name"] += fn.get("name") or ""
    slot["function"]["arguments"] += fn.get("arguments") or ""


def chat_completion(url, messages, tools=None, stream=False):
    """POST one chat-completion request and collect the assistant's reply.

    Assistant text is echoed to stdout as it is received.

    Args:
        url: Full URL of the chat completions endpoint.
        messages: Conversation messages to send.
        tools: Optional tool definitions; when truthy they are sent with
            ``tool_choice`` set to ``"auto"``.
        stream: When True, request a streamed reply and parse its
            ``data: `` lines; otherwise parse a single JSON body.

    Returns:
        A dict with ``"content"`` (str) and ``"tool_calls"`` (list), plus
        ``"reasoning_content"`` when the server supplied any. Returns None
        (after printing a failure) if the request fails or the response body
        cannot be read or decoded.
    """
    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 4096,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"

    # NOTE(review): no timeout is passed, so a stalled server blocks forever.
    try:
        response = requests.post(url, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: list[dict] = []

    try:
        if stream:
            for line in response.iter_lines():
                if not line:
                    continue
                decoded = line.decode("utf-8")
                if not decoded.startswith("data: "):
                    continue
                data_str = decoded[6:]
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    # Skip malformed event lines rather than abort the stream.
                    continue
                choices = data.get("choices") or []
                if not choices:
                    continue
                delta = choices[0].get("delta") or {}
                if delta.get("reasoning_content"):
                    reasoning_content += delta["reasoning_content"]
                if delta.get("content"):
                    full_content += delta["content"]
                    print_model_output(delta["content"])
                for fragment in delta.get("tool_calls") or []:
                    _merge_tool_call_delta(tool_calls, fragment)
        else:
            data = response.json()
            choices = data.get("choices") or []
            if choices:
                msg = choices[0].get("message") or {}
                full_content = msg.get("content") or ""
                reasoning_content = msg.get("reasoning_content") or ""
                tool_calls = msg.get("tool_calls") or []
                if full_content:
                    print_model_output(full_content)
    except (requests.exceptions.RequestException, ValueError) as e:
        # Covers connection drops mid-stream and undecodable bodies
        # (JSON and UTF-8 decode errors are ValueError subclasses).
        print_fail(f"Failed to read response: {e}")
        return None
    finally:
        # Release the connection even when the stream loop exits early.
        response.close()

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result


def _parse_tool_arguments(raw):
    """Decode a tool call's ``arguments`` payload into a dict.

    Returns ``{}`` when the payload is missing, not valid JSON, or decodes to
    something other than a JSON object. A payload that is already a dict is
    returned unchanged.
    """
    if isinstance(raw, dict):
        return raw
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        # TypeError: not str/bytes (e.g. None); ValueError: malformed JSON.
        return {}
    return parsed if isinstance(parsed, dict) else {}


def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
    """
    Drive the multi-turn tool-call loop, but record each turn's tool calls
    separately so parallelism can be validated.

    Each turn sends the conversation to ``chat_completion``. If the reply has
    tool calls, every call is answered from ``mock_tool_responses`` (keyed by
    tool name) and the results are appended as ``tool`` messages before the
    next turn. The loop ends on the first reply without tool calls, on a
    request failure, or after ``max_turns`` turns.

    Returns (turns, all_tool_calls, final_content) where `turns` is a list
    of dicts: {"index": int, "tool_calls": [...], "content": str}.
    ``final_content`` is None when the request failed or the turn budget ran
    out before the model produced a reply without tool calls.
    """
    msgs = list(messages)  # copy so the test case's message list is not mutated
    turns: list[dict] = []
    all_tool_calls: list[dict] = []

    for turn_idx in range(max_turns):
        result = chat_completion(url, msgs, tools=tools, stream=stream)
        if result is None:
            return turns, all_tool_calls, None

        tcs = result.get("tool_calls") or []
        content = result.get("content") or ""

        turns.append(
            {"index": turn_idx, "tool_calls": list(tcs), "content": content}
        )

        if not tcs:
            if content:
                _print(f"\n{DIM}{'·' * 60}{RESET}")
                _print(f"{DIM}  model response:{RESET}\n")
            return turns, all_tool_calls, content

        print_turn_banner(turn_idx, len(tcs))
        all_tool_calls.extend(tcs)

        assistant_msg: dict = {
            "role": "assistant",
            "content": content,
            "tool_calls": tcs,
        }
        reasoning = result.get("reasoning_content")
        if reasoning:
            assistant_msg["reasoning_content"] = reasoning
        msgs.append(assistant_msg)

        for tc in tcs:
            tool_name = tc["function"]["name"]
            # Model-generated arguments may be malformed or not an object;
            # the mocks expect a dict, so fall back to {} in those cases.
            args = _parse_tool_arguments(tc["function"].get("arguments"))

            print_tool_call(tool_name, args)

            mock_fn = mock_tool_responses.get(tool_name)
            if mock_fn:
                try:
                    tool_result = mock_fn(args)
                except Exception as exc:
                    # Harness boundary: a mock tripping over odd model-supplied
                    # arguments must not abort the remaining tests, so report
                    # the failure back to the model as a tool error instead.
                    print_warn(f"Mock for {tool_name!r} raised: {exc!r}")
                    tool_result = json.dumps(
                        {"error": f"Tool {tool_name} failed: {exc}"}
                    )
            else:
                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})

            print_tool_result(tool_result)

            msgs.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.get("id", ""),
                    "content": tool_result,
                }
            )

    return turns, all_tool_calls, None


# ---------------------------------------------------------------------------
# Parallelism helpers
# ---------------------------------------------------------------------------


def _best_parallel_turn(turns):
    """Pick the turn holding the most tool calls (earliest wins ties).

    Returns None when no turn contains any tool call.
    """
    best = None
    for turn in turns:
        calls = turn["tool_calls"]
        if not calls:
            continue
        # Strict ">" keeps the first turn among equally sized ones.
        if best is None or len(calls) > len(best["tool_calls"]):
            best = turn
    return best


def _distinct_tool_names(turn):
    """Return the set of function names invoked by ``turn``'s tool calls."""
    names = set()
    for call in turn["tool_calls"]:
        names.add(call["function"]["name"])
    return names


def _distinct_arg_values(turn, key):
    """Collect the distinct values passed for argument ``key`` in one turn.

    String values are compared case-insensitively with surrounding whitespace
    removed. Calls whose arguments are missing, are not valid JSON, are not a
    JSON object, or that omit ``key`` (or pass null for it) contribute nothing.
    Unhashable values (JSON arrays/objects) are represented by their canonical
    JSON text so they can still be told apart.

    Returns:
        A set of the normalized values.
    """
    values = set()
    for tc in turn["tool_calls"]:
        raw = tc["function"].get("arguments")
        if isinstance(raw, dict):
            args = raw
        else:
            try:
                args = json.loads(raw)
            except (TypeError, ValueError):
                # TypeError: not str/bytes (e.g. None); ValueError: bad JSON.
                continue
        if not isinstance(args, dict):
            # e.g. the model emitted a JSON array or bare string as arguments.
            continue
        v = args.get(key)
        if v is None:
            continue
        if isinstance(v, str):
            values.add(v.strip().lower())
            continue
        try:
            values.add(v)
        except TypeError:
            # Lists and dicts are unhashable; use a stable textual form.
            values.add(json.dumps(v, sort_keys=True))
    return values


def _parallel_turn_failure(turn, turns, expected):
    """Return why ``turn`` misses the ``expected`` constraints, or None if it meets them."""
    calls = turn["tool_calls"]

    min_parallel = expected.get("min_parallel", 2)
    if len(calls) < min_parallel:
        by_turn = [len(t["tool_calls"]) for t in turns]
        return (
            f"No turn had >= {min_parallel} parallel tool calls "
            f"(per-turn counts: {by_turn})"
        )

    require_same = expected.get("require_same_tool")
    if require_same is not None:
        names = [tc["function"]["name"] for tc in calls]
        if any(n != require_same for n in names):
            return (
                f"Parallel turn mixed tools; expected all {require_same!r}, got {names}"
            )

    require_distinct = expected.get("require_distinct_tools")
    if require_distinct is not None:
        distinct = _distinct_tool_names(turn)
        if len(distinct) < require_distinct:
            return (
                f"Parallel turn had only {len(distinct)} distinct tool names "
                f"({distinct}); need >= {require_distinct}"
            )

    distinct_key = expected.get("min_distinct_args_key")
    # The required number of distinct values defaults to min_parallel.
    distinct_count = expected.get("min_distinct_args_count", min_parallel)
    if distinct_key is not None:
        values = _distinct_arg_values(turn, distinct_key)
        if len(values) < distinct_count:
            return (
                f"Parallel turn had only {len(values)} distinct {distinct_key!r} "
                f"values ({values}); need >= {distinct_count}"
            )

    return None


def _check_parallel(turns, expected):
    """
    Check that at least one turn satisfies the parallel-call expectations.

    Every turn that made tool calls is evaluated, largest first. The first one
    meeting all constraints in ``expected`` makes the check pass. If none does,
    the failure reported is that of the turn with the most tool calls.

    Returns (ok, reason).
    """
    # sorted() is stable even with reverse=True, so among equally sized turns
    # the earliest is tried first.
    candidates = sorted(
        (t for t in turns if t["tool_calls"]),
        key=lambda t: len(t["tool_calls"]),
        reverse=True,
    )
    if not candidates:
        return False, "No tool calls were made at all"

    first_failure = None
    for turn in candidates:
        failure = _parallel_turn_failure(turn, turns, expected)
        if failure is None:
            return True, (
                f"Parallel turn had {len(turn['tool_calls'])} calls across "
                f"{len(_distinct_tool_names(turn))} distinct tool(s)"
            )
        if first_failure is None:
            first_failure = failure

    return False, first_failure


# ---------------------------------------------------------------------------
# Test case runner
# ---------------------------------------------------------------------------


def run_test(url, test_case, stream):
    """Run one test case in the given mode and print its verdict.

    The case passes only if the parallel-call expectations hold and the
    case's own ``validate`` callable accepts the outcome. Returns the
    pass/fail result.
    """
    mode = "stream" if stream else "non-stream"
    print_header(f"{test_case['name']}  [{mode}]")

    turns, all_tool_calls, final_content = run_agentic_loop(
        url,
        messages=test_case["messages"],
        tools=test_case["tools"],
        mock_tool_responses=test_case["mock_tool_responses"],
        stream=stream,
    )

    # No recorded turn means the very first request failed.
    if not turns:
        print_fail("No response from server.")
        return False

    parallel_ok, parallel_reason = _check_parallel(
        turns, test_case["expected_parallel"]
    )
    if not parallel_ok:
        print_fail(parallel_reason)
        return False

    validator = test_case["validate"]
    passed, reason = validator(turns, all_tool_calls, final_content)
    if not passed:
        print_fail(reason)
    else:
        print_pass(f"{parallel_reason}; {reason}")
    return passed


# ---------------------------------------------------------------------------
# Test case definitions
# ---------------------------------------------------------------------------

# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ----

# Tool schema offered to the model for Test 1: a single `read_file` tool
# taking one required string argument, `path`.
_FILE_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "read_file",
            "description": (
                "Read the full contents of a file from the local filesystem. "
                "Call this tool in parallel when asked to read several files — "
                "each path needs its own call."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Absolute or repo-relative path to a file",
                    },
                },
                "required": ["path"],
            },
        },
    },
]

# Canned file bodies served by `_read_file_mock`, keyed by repo-relative path.
_FILE_CONTENTS = {
    "config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n",
    "config/redis.yml":    "host: cache.internal\nport: 6379\ndb: 0\n",
    "config/queue.yml":    "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n",
    "config/auth.yml":     "provider: oidc\nissuer: https://auth.internal\n",
}


def _read_file_mock(args):
    """Mock `read_file`: return JSON with the canned content for ``args["path"]``.

    The path matches either exactly (after dropping leading "." and "/"
    characters) or by ending with a known path. Unknown paths yield a JSON
    object carrying an "error" field.
    """
    path = args.get("path", "")
    # lstrip takes a character set, so this removes any leading run of "."
    # and "/" characters (which already covers a following lstrip("/")).
    normalized = path.lstrip("./")
    content = _FILE_CONTENTS.get(normalized)
    if content is None:
        # Fall back to suffix matching, e.g. for absolute paths.
        content = next(
            (body for known, body in _FILE_CONTENTS.items() if path.endswith(known)),
            None,
        )
    if content is not None:
        return json.dumps({"path": path, "content": content})
    return json.dumps({"path": path, "error": "not found"})


# Test 1 definition: the model is asked to read four config files and must
# issue at least four `read_file` calls with four distinct `path` values
# within a single turn.
MULTIFILE_READ_TEST = {
    "name": "Parallel multi-file read (same tool, 4 distinct paths)",
    "tools": _FILE_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "Please read all four of these config files so I can review them "
                "together: config/database.yml, config/redis.yml, config/queue.yml, "
                "and config/auth.yml. Call read_file for every path in parallel in "
                "a single batch — do NOT read them one by one sequentially across "
                "turns. After you have all four, give me a one-line summary of each."
            ),
        }
    ],
    "mock_tool_responses": {"read_file": _read_file_mock},
    "expected_parallel": {
        "min_parallel": 4,
        "require_same_tool": "read_file",
        "min_distinct_args_key": "path",
        "min_distinct_args_count": 4,
    },
    # Wrapped in a lambda because _validate_multifile is defined further down;
    # the name is only looked up when the validator is called.
    "validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content),
}


def _validate_multifile(turns, tcs, content):
    """Pass when the model produced a non-empty final summary.

    ``turns`` is accepted for signature compatibility and not inspected.
    Returns (passed, reason).
    """
    if content:
        total = len(tcs)
        return True, f"{total} total read_file calls; content length={len(content)}"
    return False, "No final summary produced"


# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ----

# Tool schema for Test 2: `mark_todo_complete` with a required `todo_id`
# and an optional `note`.
_TODO_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "mark_todo_complete",
            "description": (
                "Mark a single TODO item as complete by ID. When the user wants "
                "several items marked at once, call this tool in parallel — "
                "one call per item — rather than sequentially across turns."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "todo_id": {
                        "type": "string",
                        "description": "Identifier of the TODO item",
                    },
                    "note": {
                        "type": "string",
                        "description": "Optional completion note",
                    },
                },
                "required": ["todo_id"],
            },
        },
    },
]

# Known TODO items (id -> title) consulted by `_mark_todo_mock`.
_TODO_DB = {
    "T-101": "Draft onboarding doc",
    "T-102": "Update dependency lockfile",
    "T-103": "Fix flaky login test",
    "T-104": "Rotate service credentials",
    "T-105": "Archive Q4 reports",
}


def _mark_todo_mock(args):
    """Mock `mark_todo_complete`: report "done" for known IDs, an error otherwise."""
    todo_id = args.get("todo_id", "")
    if todo_id not in _TODO_DB:
        return json.dumps({"todo_id": todo_id, "error": "unknown id"})
    reply = {"todo_id": todo_id, "title": _TODO_DB[todo_id], "status": "done"}
    return json.dumps(reply)


# Test 2 definition: five TODO IDs must be completed via at least five
# `mark_todo_complete` calls with five distinct `todo_id` values in one turn.
TODO_BATCH_TEST = {
    "name": "Batch TODO completion (same tool, 5 IDs in one turn)",
    "tools": _TODO_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "I finished every item on today's list. Please mark all of the "
                "following TODOs as complete, in one parallel batch: T-101, T-102, "
                "T-103, T-104, T-105. Don't mark them one at a time across separate "
                "turns — issue all five mark_todo_complete calls at once. Afterwards "
                "confirm which ones succeeded."
            ),
        }
    ],
    "mock_tool_responses": {"mark_todo_complete": _mark_todo_mock},
    "expected_parallel": {
        "min_parallel": 5,
        "require_same_tool": "mark_todo_complete",
        "min_distinct_args_key": "todo_id",
        "min_distinct_args_count": 5,
    },
    # Lambda defers the lookup of _validate_todo, which is defined below.
    "validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content),
}


def _validate_todo(turns, tcs, content):
    """Pass when the model produced a non-empty confirmation message.

    ``turns`` is accepted for signature compatibility and not inspected.
    Returns (passed, reason).
    """
    if content:
        return True, f"{len(tcs)} total mark_todo_complete calls"
    return False, "No confirmation summary produced"


# ---- Test 3: Multi-city weather (same tool, N parallel locations) ----

# Tool schema for Test 3: `get_weather` with a required `city` and an
# optional `units` enum.
_WEATHER_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": (
                "Fetch current weather for ONE city. When the user asks about "
                "several cities, call this tool in parallel — one call per city — "
                "instead of sequentially."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "city":  {"type": "string", "description": "City name"},
                    "units": {
                        "type": "string",
                        "enum": ["metric", "imperial"],
                        "default": "metric",
                    },
                },
                "required": ["city"],
            },
        },
    },
]

# Canned weather records keyed by lower-cased city name; `_weather_mock`
# lower-cases the requested city before looking it up here.
_WEATHER_DB = {
    "tokyo":  {"city": "Tokyo",  "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64},
    "london": {"city": "London", "temp_c":  9.1, "condition": "overcast",       "humidity": 81},
    "new york": {"city": "New York", "temp_c": 12.7, "condition": "clear",      "humidity": 55},
    "paris":  {"city": "Paris",  "temp_c": 11.3, "condition": "light rain",     "humidity": 78},
}


def _weather_mock(args):
    """Mock `get_weather`: return the canned record for the requested city.

    Matching ignores case and surrounding whitespace; anything starting with
    "new york" (e.g. "New York City") maps to New York. Unknown cities yield
    a JSON object carrying an "error" field.
    """
    requested = args.get("city", "")
    lookup_key = requested.strip().lower()
    if lookup_key.startswith("new york"):
        lookup_key = "new york"
    if lookup_key not in _WEATHER_DB:
        return json.dumps({"city": requested, "error": "unknown city"})
    return json.dumps(_WEATHER_DB[lookup_key])


# Test 3 definition: four cities must be fetched via at least four
# `get_weather` calls with four distinct `city` values in one turn.
MULTI_WEATHER_TEST = {
    "name": "Parallel multi-city weather (same tool, 4 cities)",
    "tools": _WEATHER_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm comparing today's weather across four cities for a travel "
                "decision: Tokyo, London, New York, and Paris. Please call "
                "get_weather for all four in parallel in a single turn — don't "
                "fetch them one at a time. Then rank them from warmest to coolest."
            ),
        }
    ],
    "mock_tool_responses": {"get_weather": _weather_mock},
    "expected_parallel": {
        "min_parallel": 4,
        "require_same_tool": "get_weather",
        "min_distinct_args_key": "city",
        "min_distinct_args_count": 4,
    },
    # Lambda defers the lookup of _validate_weather, which is defined below.
    "validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content),
}


def _validate_weather(turns, tcs, content):
    """Pass when the final reply contains a ranking-related keyword.

    The check is a case-insensitive substring search for "warmest", "rank",
    "hot" or "cool". ``turns`` is not inspected. Returns (passed, reason).
    """
    keywords = ("warmest", "rank", "hot", "cool")
    lowered = content.lower() if content else ""
    for keyword in keywords:
        if keyword in lowered:
            return True, f"{len(tcs)} total get_weather calls; ranking produced"
    return False, f"Final content missing a ranking: {content!r}"


# ---- Test 4: Trip planning (different tools, parallel in one turn) ----

# Tool schemas for Test 4: three independent search tools (flights, hotels,
# restaurants) that the model is expected to call together.
_TRIP_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_flights",
            "description": "Search one-way flights between two airports on a given date.",
            "parameters": {
                "type": "object",
                "properties": {
                    "from_airport": {"type": "string", "description": "IATA code, e.g. SFO"},
                    "to_airport":   {"type": "string", "description": "IATA code, e.g. JFK"},
                    "date":         {"type": "string", "description": "YYYY-MM-DD"},
                },
                "required": ["from_airport", "to_airport", "date"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_hotels",
            "description": "Search hotels in a city for a date range.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city":       {"type": "string"},
                    "check_in":   {"type": "string", "description": "YYYY-MM-DD"},
                    "check_out":  {"type": "string", "description": "YYYY-MM-DD"},
                    "max_price":  {"type": "integer"},
                },
                "required": ["city", "check_in", "check_out"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_restaurants",
            "description": "Search restaurants in a city by cuisine.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city":    {"type": "string"},
                    "cuisine": {"type": "string"},
                },
                "required": ["city"],
            },
        },
    },
]

# Fixed payloads returned by the Test 4 mocks regardless of the arguments.
_FLIGHTS_RESULT = {
    "results": [
        {"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389},
        {"flight": "AA  20",  "depart": "10:00", "arrive": "18:35", "price": 412},
    ]
}
_HOTELS_RESULT = {
    "results": [
        {"name": "Midtown Grand",    "nightly_rate": 245, "rating": 4.3},
        {"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6},
    ]
}
_RESTAURANTS_RESULT = {
    "results": [
        {"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5},
        {"name": "Osteria Blu",    "cuisine": "italian", "rating": 4.4},
    ]
}

# Test 4 definition: one turn must contain at least three tool calls
# spanning at least three distinct tool names.
TRIP_PLAN_TEST = {
    "name": "Trip planning (3 different tools in parallel)",
    "tools": _TRIP_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm flying from SFO to JFK on 2026-06-12 and staying four nights "
                "(check out 2026-06-16). I'd also like some Italian restaurant "
                "suggestions in New York. Please call search_flights, search_hotels, "
                "and search_restaurants in parallel — all three in a single turn, "
                "since they don't depend on each other. Then give me a concise "
                "travel summary."
            ),
        }
    ],
    # Each mock ignores its arguments and returns the same canned payload.
    "mock_tool_responses": {
        "search_flights": lambda _: json.dumps(_FLIGHTS_RESULT),
        "search_hotels": lambda _: json.dumps(_HOTELS_RESULT),
        "search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT),
    },
    "expected_parallel": {
        "min_parallel": 3,
        "require_distinct_tools": 3,
    },
    # Lambda defers the lookup of _validate_trip, which is defined below.
    "validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content),
}


def _validate_trip(turns, tcs, content):
    """Pass when all three trip tools were called and a summary was produced.

    ``turns`` is not inspected. Returns (passed, reason).
    """
    required = {"search_flights", "search_hotels", "search_restaurants"}
    called = set()
    for tc in tcs:
        called.add(tc["function"]["name"])
    missing = required - called
    if missing:
        return False, f"Missing tool calls: {missing}"
    if content:
        return True, f"All three tools called; summary length={len(content)}"
    return False, "No travel summary produced"


# ---- Test 5: Portfolio check (same tool, parallel tickers) ----

# Tool schema for Test 5: `get_stock_quote` with a required `symbol`.
_STOCK_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_stock_quote",
            "description": (
                "Get the latest quote for ONE ticker. When the user asks about "
                "multiple tickers, call this tool in parallel — one per symbol — "
                "rather than sequentially."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {"type": "string", "description": "Ticker symbol"},
                },
                "required": ["symbol"],
            },
        },
    },
]

# Canned quotes keyed by upper-case ticker. NVDA carries the largest
# change_pct, which `_validate_portfolio` expects the model to report.
_STOCK_DB = {
    "AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"},
    "MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"},
    "GOOGL":{"symbol": "GOOGL","price": 175.22, "change_pct": "-0.3%"},
    "AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"},
    "NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"},
}


def _stock_mock(args):
    """Mock `get_stock_quote`: return the canned quote for the requested ticker.

    The symbol is stripped and upper-cased before lookup; unknown tickers
    yield a JSON object carrying an "error" field.
    """
    ticker = args.get("symbol", "").strip().upper()
    if ticker not in _STOCK_DB:
        return json.dumps({"symbol": ticker, "error": "unknown ticker"})
    return json.dumps(_STOCK_DB[ticker])


# Test 5 definition: five tickers must be quoted via at least five
# `get_stock_quote` calls with five distinct `symbol` values in one turn.
PORTFOLIO_TEST = {
    "name": "Portfolio check (same tool, 5 tickers in parallel)",
    "tools": _STOCK_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "Pull the latest quote for every ticker in my portfolio — AAPL, "
                "MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These "
                "lookups are independent, so please don't chain them across turns. "
                "Once you have all five, tell me which ticker had the biggest "
                "percentage change today."
            ),
        }
    ],
    "mock_tool_responses": {"get_stock_quote": _stock_mock},
    "expected_parallel": {
        "min_parallel": 5,
        "require_same_tool": "get_stock_quote",
        "min_distinct_args_key": "symbol",
        "min_distinct_args_count": 5,
    },
    # Lambda defers the lookup of _validate_portfolio, which is defined below.
    "validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content),
}


def _validate_portfolio(turns, tcs, content):
    """Pass when the final reply mentions NVDA (case-insensitive).

    ``turns`` is not inspected. Returns (passed, reason).
    """
    # A case-insensitive search already covers the upper-case spelling, so a
    # separate check for "NVDA" adds nothing.
    mentions_nvda = bool(content) and "nvda" in content.lower()
    if mentions_nvda:
        return True, f"{len(tcs)} total quotes pulled"
    return False, f"Expected NVDA to be identified as the biggest mover: {content!r}"


# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ----

# Tool schemas for Test 6: translation, dictionary definition and synonym
# lookup — three independent tools.
_LANG_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "translate_text",
            "description": "Translate a short text into a target language.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text":            {"type": "string"},
                    "target_language": {"type": "string",
                                        "description": "ISO 639-1 language code, e.g. 'es'"},
                },
                "required": ["text", "target_language"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_definition",
            "description": "Get the English dictionary definition of a word.",
            "parameters": {
                "type": "object",
                "properties": {
                    "word": {"type": "string"},
                },
                "required": ["word"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_synonyms",
            "description": "Get English synonyms for a word.",
            "parameters": {
                "type": "object",
                "properties": {
                    "word": {"type": "string"},
                },
                "required": ["word"],
            },
        },
    },
]


def _translate_mock(args):
    """Mock `translate_text`: echo the text prefixed with "[<language>] "."""
    source = args.get("text", "")
    lang = args.get("target_language", "")
    reply = {
        "source": source,
        "target_language": lang,
        "translation": f"[{lang}] {source}",
    }
    return json.dumps(reply)


def _definition_mock(args):
    """Mock `get_definition`: return a placeholder definition for the word."""
    word = args.get("word", "")
    definition = f"A standard dictionary definition of {word!r}."
    return json.dumps({"word": word, "definition": definition})


def _synonyms_mock(args):
    """Mock `get_synonyms`: return three fixed placeholder synonyms."""
    reply = {"word": args.get("word", "")}
    reply["synonyms"] = ["synonym_a", "synonym_b", "synonym_c"]
    return json.dumps(reply)


# Test 6 definition: one turn must contain at least three tool calls
# spanning at least three distinct tool names.
LANG_TOOLKIT_TEST = {
    "name": "Language toolkit (translate + definition + synonyms in parallel)",
    "tools": _LANG_TOOLS,
    "messages": [
        {
            "role": "user",
            "content": (
                "For the English word 'resilient', I need three independent "
                "look-ups at once: (a) translate it into Spanish, (b) fetch its "
                "dictionary definition, and (c) list its synonyms. These three "
                "calls don't depend on each other — please issue them in parallel "
                "in a single turn. Then present the combined results as a short "
                "language note."
            ),
        }
    ],
    "mock_tool_responses": {
        "translate_text":  _translate_mock,
        "get_definition":  _definition_mock,
        "get_synonyms":    _synonyms_mock,
    },
    "expected_parallel": {
        "min_parallel": 3,
        "require_distinct_tools": 3,
    },
    # Lambda defers the lookup of _validate_lang, which is defined below.
    "validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content),
}


def _validate_lang(turns, tcs, content):
    """Pass when all three language tools were called and a note was produced.

    ``turns`` is not inspected. Returns (passed, reason).
    """
    required = {"translate_text", "get_definition", "get_synonyms"}
    called = set()
    for tc in tcs:
        called.add(tc["function"]["name"])
    missing = required - called
    if missing:
        return False, f"Missing tool calls: {missing}"
    if content:
        return True, f"All three lookup tools called; note length={len(content)}"
    return False, "No language note produced"


# ---------------------------------------------------------------------------
# All test cases
# ---------------------------------------------------------------------------

# Registry of every test case; `main` runs them in this order once per
# selected mode (non-streaming and/or streaming).
ALL_TEST_CASES = [
    MULTIFILE_READ_TEST,
    TODO_BATCH_TEST,
    MULTI_WEATHER_TEST,
    TRIP_PLAN_TEST,
    PORTFOLIO_TEST,
    LANG_TOOLKIT_TEST,
]


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main():
    """Parse CLI options, run the selected tests, and exit with the result.

    By default every test runs twice: once non-streaming, then once
    streaming. ``--no-stream`` and ``--stream-only`` restrict the run to one
    mode and cannot be combined. ``--test`` filters cases by a
    case-insensitive substring of their name.

    Exits with status 0 when every executed test passed, 1 otherwise (also
    when ``--test`` matches nothing).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Test llama-server parallel tool-calling capability. Run this only "
            "against models configured for parallel tool calls — this script "
            "does not configure that itself."
        )
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    # Together the two flags would select no mode at all, running zero tests
    # and reporting "0/0 passed" with a success exit code, so argparse is
    # told to reject the combination.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    mode_group.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args()

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")
    print_warn(
        "This script expects the target model to emit multiple tool calls in a "
        "single assistant turn. Run it only against parallel-tool-capable models."
    )

    # Each entry is the `stream` flag for one pass over the test cases.
    modes: list[bool] = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases: list[dict] = ALL_TEST_CASES
    if args.test:
        name_filter = args.test.lower()
        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    color = GREEN if passed == total else RED
    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
    _print(f"{BOLD}{color}  Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
    sys.exit(0 if passed == total else 1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()