# Mirror of https://github.com/LostRuins/koboldcpp.git
# Synced 2026-04-28 03:30:20 +00:00
#!/usr/bin/env python3
|
|
"""
|
|
Test structured output capability via chat completions endpoint.
|
|
|
|
Each test case contains:
|
|
- response_format: OpenAI-compatible response_format specification.
|
|
Both "json_schema" and "json_object" are accepted; with
|
|
"json_object" a schema can be supplied via extra_body.
|
|
- extra_body (optional): dict of extra top-level request fields merged into
|
|
the request payload (mirrors the OpenAI SDK's extra_body
|
|
feature; llama.cpp reads a top-level "json_schema" here).
|
|
- messages: initial conversation messages
|
|
- tools (optional): tool definitions (for mixed tool + structured tests)
|
|
- mock_tool_responses (optional): dict mapping tool_name -> callable(arguments) -> str (JSON)
|
|
- apply_stage: "always" to apply response_format to every request,
|
|
"after_tools" to run the tool loop plain, then request a
|
|
structured summary in a follow-up user turn.
|
|
- followup (optional, for after_tools): user message appended before the
|
|
final structured call.
|
|
- validate: callable(parsed_json, tool_calls_history, raw_content) -> (passed: bool, reason: str)
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import requests
|
|
import sys
|
|
from typing import Any, cast
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Color / formatting helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# ANSI SGR escape sequences used to colorize terminal output below.
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"
CYAN = "\x1b[36m"
YELLOW = "\x1b[33m"
GREEN = "\x1b[32m"
RED = "\x1b[31m"
BLUE = "\x1b[34m"
WHITE = "\x1b[97m"
MAGENTA = "\x1b[35m"
|
|
|
|
|
|
def _print(text="", end="\n"):
|
|
sys.stdout.write(text + end)
|
|
sys.stdout.flush()
|
|
|
|
|
|
def print_header(title):
    """Draw *title* inside a 60-column cyan box."""
    bar = "─" * 60
    pad = " " * max(0, 58 - len(title))
    top = f"\n{BOLD}{CYAN}┌{bar}┐{RESET}"
    middle = f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{pad}│{RESET}"
    bottom = f"{BOLD}{CYAN}└{bar}┘{RESET}"
    for row in (top, middle, bottom):
        _print(row)
|
|
|
|
|
|
def print_tool_call(name, args):
    """Announce a model-issued tool call with its JSON-encoded arguments."""
    rendered = json.dumps(args)
    _print(
        f"\n  {BOLD}{YELLOW}⚙ tool call{RESET} {CYAN}{name}{RESET}{DIM}({rendered}){RESET}"
    )
|
|
|
|
|
|
def print_tool_result(result):
    """Show a preview of a tool's result, truncated to 160 characters."""
    shortened = result[:160] + ("…" if len(result) > 160 else "")
    _print(f"  {DIM}{BLUE}↳ result{RESET} {DIM}{shortened}{RESET}")
|
|
|
|
|
|
def print_model_output(text):
    """Stream raw model text to stdout without appending a newline."""
    out = sys.stdout
    out.write(text)
    out.flush()
|
|
|
|
|
|
def print_pass(reason):
    """Report a passing test in bold green."""
    badge = f"{BOLD}{GREEN}✔ PASS{RESET}"
    _print(f"\n{badge} {reason}")
|
|
|
|
|
|
def print_fail(reason):
    """Report a failing test in bold red."""
    badge = f"{BOLD}{RED}✘ FAIL{RESET}"
    _print(f"\n{badge} {reason}")
|
|
|
|
|
|
def print_info(msg):
    """Print a dimmed informational line."""
    _print(DIM + msg + RESET)
|
|
|
|
|
|
def print_schema_note(label, rf, extra_body=None):
    """Log which response_format kind (and schema name, if any) a request uses.

    For "json_schema" the name comes from the nested schema object; for
    "json_object" it is taken from the extra_body tunnel when present.
    """
    kind = rf.get("type", "?")
    schema_name = ""
    if kind == "json_schema":
        schema_name = rf.get("json_schema", {}).get("name", "")
    elif kind == "json_object" and extra_body and "json_schema" in extra_body:
        tunneled = extra_body["json_schema"] or {}
        schema_name = tunneled.get("title") or "extra_body.json_schema"
    suffix = f" / {schema_name}" if schema_name else ""
    _print(f"{DIM}{MAGENTA}  ⟐ response_format [{label}]: {kind}{suffix}{RESET}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def chat_completion(url, messages, tools=None, response_format=None, stream=False,
                    extra_body=None):
    """POST one /v1/chat/completions request and normalise the reply.

    Handles both streaming (SSE) and non-streaming responses.

    Args:
        url: full chat-completions endpoint URL.
        messages: OpenAI-style message list, sent as-is.
        tools: optional tool definitions; also sets tool_choice="auto".
        response_format: optional response_format object (json_schema/json_object).
        stream: request SSE streaming when True.
        extra_body: extra top-level fields merged into the payload (e.g. a
            llama.cpp top-level "json_schema").

    Returns:
        dict with "content" (str) and "tool_calls" (list), plus
        "reasoning_content" when the server produced any; None on a
        transport or HTTP error (the error is printed).
    """
    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 8192,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"
    if response_format is not None:
        payload["response_format"] = response_format
    if extra_body:
        payload.update(extra_body)

    try:
        # Fix: the original call passed no timeout, so a wedged server would
        # hang the test run forever. 10s to connect; 600s read timeout (for a
        # stream this bounds the gap between chunks, not total duration).
        response = requests.post(url, json=payload, stream=stream,
                                 timeout=(10, 600))
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: list[dict] = []

    if stream:
        # Accumulate SSE deltas: content/reasoning text plus fragmented
        # tool-call ids, names and argument strings keyed by index.
        for line in response.iter_lines():
            if not line:
                continue
            decoded = line.decode("utf-8")
            if not decoded.startswith("data: "):
                continue
            data_str = decoded[6:]
            if data_str == "[DONE]":
                break
            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                continue
            choices = data.get("choices", [])
            if not choices:
                continue
            delta = choices[0].get("delta", {})
            if delta.get("reasoning_content"):
                reasoning_content += delta["reasoning_content"]
            if delta.get("content"):
                full_content += delta["content"]
                print_model_output(delta["content"])
            for tc in delta.get("tool_calls", []):
                idx = tc.get("index", 0)
                # Grow the accumulator so sparse/out-of-order indices are safe.
                while len(tool_calls) <= idx:
                    tool_calls.append(
                        {
                            "id": "",
                            "type": "function",
                            "function": {"name": "", "arguments": ""},
                        }
                    )
                # Fix: the original used `if "id" in tc` and concatenated
                # tc["id"] directly, which raises TypeError when a server
                # sends an explicit null. Truthiness checks skip both
                # missing and null fragments (empty strings are no-ops).
                if tc.get("id"):
                    tool_calls[idx]["id"] += tc["id"]
                fn = tc.get("function") or {}
                if fn.get("name"):
                    tool_calls[idx]["function"]["name"] += fn["name"]
                if fn.get("arguments"):
                    tool_calls[idx]["function"]["arguments"] += fn["arguments"]
    else:
        data = response.json()
        choices = data.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            full_content = msg.get("content") or ""
            reasoning_content = msg.get("reasoning_content") or ""
            tool_calls = msg.get("tool_calls") or []
            if full_content:
                print_model_output(full_content)

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result
|
|
|
|
|
|
def run_tool_loop(
    url, messages, tools, mock_tool_responses, stream, response_format=None,
    extra_body=None, max_turns=6,
):
    """
    Drive the tool-call loop. If response_format is provided it is applied to
    every request. Returns (all_tool_calls, final_messages, final_content).
    """
    conversation = list(messages)
    collected_calls: list[dict] = []

    for _turn in range(max_turns):
        reply = chat_completion(
            url, conversation, tools=tools, response_format=response_format,
            stream=stream, extra_body=extra_body,
        )
        if reply is None:
            # Transport failure: report what we gathered, with no final text.
            return collected_calls, conversation, None

        turn_calls = reply.get("tool_calls") or []
        turn_content = reply.get("content") or ""

        # No tool calls means the model produced its final answer.
        if not turn_calls:
            if turn_content:
                _print(f"\n{DIM}{'·' * 60}{RESET}")
            return collected_calls, conversation, turn_content

        collected_calls.extend(turn_calls)

        assistant_msg: dict = {
            "role": "assistant",
            "content": turn_content,
            "tool_calls": turn_calls,
        }
        if reply.get("reasoning_content"):
            assistant_msg["reasoning_content"] = reply["reasoning_content"]
        conversation.append(assistant_msg)

        # Execute each requested tool against its mock and feed the result back.
        for call in turn_calls:
            fn_name = call["function"]["name"]
            try:
                fn_args = json.loads(call["function"]["arguments"])
            except json.JSONDecodeError:
                fn_args = {}

            print_tool_call(fn_name, fn_args)

            handler = mock_tool_responses.get(fn_name) if mock_tool_responses else None
            if handler:
                tool_output = handler(fn_args)
            else:
                tool_output = json.dumps({"error": f"Unknown tool: {fn_name}"})

            print_tool_result(tool_output)

            conversation.append(
                {
                    "role": "tool",
                    "tool_call_id": call.get("id", ""),
                    "content": tool_output,
                }
            )

    # Turn budget exhausted without a final answer.
    return collected_calls, conversation, None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test case runner
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _try_parse_json(text):
|
|
"""Attempt to parse text as JSON, trimming common markdown fences."""
|
|
if text is None:
|
|
return None
|
|
stripped = text.strip()
|
|
if stripped.startswith("```"):
|
|
lines = stripped.splitlines()
|
|
if lines and lines[0].startswith("```"):
|
|
lines = lines[1:]
|
|
if lines and lines[-1].strip().startswith("```"):
|
|
lines = lines[:-1]
|
|
stripped = "\n".join(lines).strip()
|
|
try:
|
|
return json.loads(stripped)
|
|
except json.JSONDecodeError:
|
|
return None
|
|
|
|
|
|
def run_test(url, test_case, stream):
    """Execute one structured-output test case and return True on pass."""
    name = test_case["name"]
    mode = "stream" if stream else "non-stream"
    apply_stage = test_case.get("apply_stage", "always")
    print_header(f"{name} [{mode}] ({apply_stage})")

    response_format = test_case["response_format"]
    extra_body = test_case.get("extra_body")
    print_schema_note(apply_stage, response_format, extra_body)

    tools = test_case.get("tools")
    mocks = test_case.get("mock_tool_responses") or {}

    all_tcs: list[dict] = []
    final_content = None

    if apply_stage == "always":
        # Schema constraint active on every request, including tool turns.
        all_tcs, _msgs, final_content = run_tool_loop(
            url,
            messages=list(test_case["messages"]),
            tools=tools,
            mock_tool_responses=mocks,
            stream=stream,
            response_format=response_format,
            extra_body=extra_body,
        )
    elif apply_stage == "after_tools":
        # Phase 1: plain tool loop, no response_format applied yet.
        all_tcs, msgs, interim_content = run_tool_loop(
            url,
            messages=list(test_case["messages"]),
            tools=tools,
            mock_tool_responses=mocks,
            stream=stream,
            response_format=None,
        )
        if interim_content:
            msgs.append({"role": "assistant", "content": interim_content})
        followup = test_case.get(
            "followup",
            "Now output the answer strictly as JSON matching the provided schema. "
            "Do not include commentary.",
        )
        msgs.append({"role": "user", "content": followup})

        # Phase 2: request final structured output. Tools are not passed so the
        # model focuses on producing the schema-constrained answer.
        _print(f"\n{DIM}{MAGENTA}  ⟐ follow-up turn with response_format applied{RESET}")
        result = chat_completion(
            url, msgs, tools=None, response_format=response_format, stream=stream,
            extra_body=extra_body,
        )
        final_content = result["content"] if result else None
    else:
        print_fail(f"Unknown apply_stage: {apply_stage}")
        return False

    if final_content is None:
        print_fail("No final content from server.")
        return False

    parsed = _try_parse_json(final_content)
    if parsed is None:
        print_fail(f"Final content is not valid JSON: {final_content[:200]!r}")
        return False

    passed, reason = test_case["validate"](parsed, all_tcs, final_content)
    if passed:
        print_pass(reason)
    else:
        print_fail(reason)
    return passed
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test case definitions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# ---- Test 1: Book metadata extraction (always / json_schema) ----
|
|
|
|
# Test 1 response_format: strict object schema with an enum-constrained
# "genre"; all five properties are required and extras are rejected.
_BOOK_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "book_metadata",
        "strict": True,
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "title": {"type": "string"},
                "author": {"type": "string"},
                "year": {"type": "integer"},
                "genre": {
                    "type": "string",
                    "enum": [
                        "fiction",
                        "non-fiction",
                        "fantasy",
                        "sci-fi",
                        "mystery",
                        "biography",
                        "history",
                        "other",
                    ],
                },
                "page_count": {"type": "integer"},
            },
            "required": ["title", "author", "year", "genre", "page_count"],
        },
    },
}
|
|
|
|
# Test 1: extract scalar fields + an enum from free text, schema applied on
# every request. No tools involved; the validator only inspects parsed JSON.
BOOK_TEST_CASE = {
    "name": "Book metadata extraction (json_schema, always)",
    "response_format": _BOOK_SCHEMA,
    "apply_stage": "always",
    "messages": [
        {
            "role": "user",
            "content": (
                "Extract book metadata from this description: "
                "'Dune is a 1965 science fiction epic by Frank Herbert, spanning roughly "
                "688 pages in its first edition, set on the desert planet Arrakis.' "
                "Return the data as JSON."
            ),
        }
    ],
    "validate": lambda parsed, tcs, raw: _validate_book(parsed),
}
|
|
|
|
|
|
def _validate_book(parsed):
|
|
required = {"title", "author", "year", "genre", "page_count"}
|
|
missing = required - parsed.keys()
|
|
if missing:
|
|
return False, f"Missing fields: {missing}"
|
|
if not isinstance(parsed["title"], str) or not parsed["title"]:
|
|
return False, "title must be a non-empty string"
|
|
if not isinstance(parsed["author"], str) or "herbert" not in parsed["author"].lower():
|
|
return False, f"author unexpected: {parsed['author']!r}"
|
|
if not isinstance(parsed["year"], int) or parsed["year"] != 1965:
|
|
return False, f"year should be 1965, got {parsed['year']!r}"
|
|
if parsed["genre"] not in {
|
|
"fiction", "non-fiction", "fantasy", "sci-fi", "mystery",
|
|
"biography", "history", "other",
|
|
}:
|
|
return False, f"genre not in enum: {parsed['genre']!r}"
|
|
if not isinstance(parsed["page_count"], int) or parsed["page_count"] <= 0:
|
|
return False, f"page_count should be positive int: {parsed['page_count']!r}"
|
|
return True, f"Book: {parsed['title']} ({parsed['year']}) / {parsed['genre']}"
|
|
|
|
|
|
# ---- Test 2: Sentiment classification (always / enum-constrained) ----
|
|
|
|
# Test 2 response_format: enum-constrained label, numeric confidence and a
# bounded (1..5) array of keyword strings.
_SENTIMENT_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "sentiment_analysis",
        "strict": True,
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "sentiment": {
                    "type": "string",
                    "enum": ["positive", "negative", "neutral"],
                },
                "confidence": {"type": "number"},
                "keywords": {
                    "type": "array",
                    "items": {"type": "string"},
                    "minItems": 1,
                    "maxItems": 5,
                },
            },
            "required": ["sentiment", "confidence", "keywords"],
        },
    },
}
|
|
|
|
# Test 2: classify an obviously positive review; exercises enum + array
# constraints with the schema applied on every request.
SENTIMENT_TEST_CASE = {
    "name": "Sentiment analysis with enum and array",
    "response_format": _SENTIMENT_SCHEMA,
    "apply_stage": "always",
    "messages": [
        {
            "role": "user",
            "content": (
                "Analyse the sentiment of this review and return JSON with the "
                "detected sentiment label, a confidence score between 0 and 1, "
                "and up to five keyword strings that drove the classification:\n\n"
                "'This product completely exceeded my expectations. The build "
                "quality is phenomenal, it arrived a day early, and customer "
                "support was delightful when I had a setup question.'"
            ),
        }
    ],
    "validate": lambda parsed, tcs, raw: _validate_sentiment(parsed),
}
|
|
|
|
|
|
def _validate_sentiment(parsed):
|
|
if parsed.get("sentiment") not in {"positive", "negative", "neutral"}:
|
|
return False, f"sentiment not in enum: {parsed.get('sentiment')!r}"
|
|
if parsed["sentiment"] != "positive":
|
|
return False, f"expected positive sentiment, got {parsed['sentiment']}"
|
|
conf = parsed.get("confidence")
|
|
if not isinstance(conf, (int, float)) or not (0.0 <= conf <= 1.0):
|
|
return False, f"confidence not in [0,1]: {conf!r}"
|
|
kws = parsed.get("keywords")
|
|
if not isinstance(kws, list) or not (1 <= len(kws) <= 5):
|
|
return False, f"keywords length out of range: {kws!r}"
|
|
if not all(isinstance(k, str) and k for k in kws):
|
|
return False, f"keywords must be non-empty strings: {kws!r}"
|
|
return True, f"sentiment={parsed['sentiment']} conf={conf} kws={kws}"
|
|
|
|
|
|
# ---- Test: json_object + extra_body.json_schema (always) ----
|
|
#
|
|
# Exercises the llama.cpp-specific path where the OpenAI SDK would send
|
|
# response_format={"type": "json_object"} and tunnel the schema through
|
|
# extra_body.json_schema (which becomes a top-level "json_schema" field on
|
|
# the request body).
|
|
|
|
# Bare draft-2020-12 schema (no properties): only pins "type": "object".
# Tunneled to the server via extra_body as a top-level "json_schema" field.
_PRODUCT_JSON_OBJECT_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://example.com/product.schema.json",
    "title": "Product",
    "description": "A product in the catalog",
    "type": "object",
}
|
|
|
|
# Exercises the llama.cpp-specific path: response_format "json_object" plus a
# schema supplied through extra_body (top-level "json_schema" on the request).
PRODUCT_JSON_OBJECT_TEST_CASE = {
    "name": "json_object response_format with extra_body json_schema",
    "response_format": {"type": "json_object"},
    "extra_body": {"json_schema": _PRODUCT_JSON_OBJECT_SCHEMA},
    "apply_stage": "always",
    "messages": [
        {
            "role": "system",
            "content": (
                "Extract structured data from the provided text according to the "
                "JSON schema. Return only valid JSON matching the schema exactly."
            ),
        },
        {
            "role": "user",
            "content": "Product: Wireless Headphones, ID: 101, In Stock: Yes",
        },
    ],
    "validate": lambda parsed, tcs, raw: _validate_product_json_object(parsed),
}
|
|
|
|
|
|
def _validate_product_json_object(parsed):
|
|
if not isinstance(parsed, dict):
|
|
return False, f"expected JSON object, got {type(parsed).__name__}: {parsed!r}"
|
|
if not parsed:
|
|
return False, f"expected non-empty object, got {parsed!r}"
|
|
return True, f"product object with {len(parsed)} field(s): {sorted(parsed.keys())}"
|
|
|
|
|
|
# ---- Test 3: Nested recipe schema (always) ----
|
|
|
|
# Test 3 response_format: nested structure — arrays of objects (ingredients)
# and strings (steps), each with a minimum length of 2.
_RECIPE_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "recipe",
        "strict": True,
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "name": {"type": "string"},
                "servings": {"type": "integer"},
                "ingredients": {
                    "type": "array",
                    "minItems": 2,
                    "items": {
                        "type": "object",
                        "additionalProperties": False,
                        "properties": {
                            "item": {"type": "string"},
                            "quantity": {"type": "string"},
                        },
                        "required": ["item", "quantity"],
                    },
                },
                "steps": {
                    "type": "array",
                    "minItems": 2,
                    "items": {"type": "string"},
                },
                "prep_time_minutes": {"type": "integer"},
            },
            "required": ["name", "servings", "ingredients", "steps", "prep_time_minutes"],
        },
    },
}
|
|
|
|
# Test 3: generated (not extracted) content constrained by a nested schema.
RECIPE_TEST_CASE = {
    "name": "Nested recipe with arrays of objects",
    "response_format": _RECIPE_SCHEMA,
    "apply_stage": "always",
    "messages": [
        {
            "role": "user",
            "content": (
                "Give me a simple 4-serving scrambled eggs recipe as structured JSON. "
                "Include the recipe name, servings, ingredients (each with item and "
                "quantity), preparation steps, and total prep time in minutes."
            ),
        }
    ],
    "validate": lambda parsed, tcs, raw: _validate_recipe(parsed),
}
|
|
|
|
|
|
def _validate_recipe(parsed):
|
|
required = {"name", "servings", "ingredients", "steps", "prep_time_minutes"}
|
|
missing = required - parsed.keys()
|
|
if missing:
|
|
return False, f"Missing fields: {missing}"
|
|
if not isinstance(parsed["name"], str) or not parsed["name"]:
|
|
return False, "name must be a non-empty string"
|
|
if not isinstance(parsed["servings"], int) or parsed["servings"] <= 0:
|
|
return False, f"servings must be positive int: {parsed['servings']!r}"
|
|
ings = parsed["ingredients"]
|
|
if not isinstance(ings, list) or len(ings) < 2:
|
|
return False, f"ingredients must be array of >=2: got {ings!r}"
|
|
for i, ing in enumerate(ings):
|
|
if not isinstance(ing, dict):
|
|
return False, f"ingredient[{i}] is not an object: {ing!r}"
|
|
ing_d = cast(dict[str, Any], ing)
|
|
item_val = ing_d.get("item")
|
|
qty_val = ing_d.get("quantity")
|
|
if item_val is None or qty_val is None:
|
|
return False, f"ingredient[{i}] missing item/quantity: {ing!r}"
|
|
if not isinstance(item_val, str) or not isinstance(qty_val, str):
|
|
return False, f"ingredient[{i}] fields must be strings: {ing!r}"
|
|
steps = parsed["steps"]
|
|
if not isinstance(steps, list) or len(steps) < 2:
|
|
return False, f"steps must be array of >=2 strings: got {steps!r}"
|
|
if not all(isinstance(s, str) and s for s in steps):
|
|
return False, "all steps must be non-empty strings"
|
|
pt = parsed["prep_time_minutes"]
|
|
if not isinstance(pt, int) or pt <= 0:
|
|
return False, f"prep_time_minutes must be positive int: {pt!r}"
|
|
return True, f"recipe '{parsed['name']}' with {len(ings)} ingredients, {len(steps)} steps"
|
|
|
|
|
|
# ---- Test 4: Tool call -> structured product comparison (after_tools) ----
|
|
|
|
# Tool definitions for the shopping test: keyword search plus per-ID specs.
_SHOP_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_products",
            "description": "Search a product catalogue by keyword.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_product_details",
            "description": "Get detailed specs for a product by ID.",
            "parameters": {
                "type": "object",
                "properties": {
                    "product_id": {"type": "string"},
                },
                "required": ["product_id"],
            },
        },
    },
]
|
|
|
|
# Canned search result returned by the search_products mock.
_SHOP_SEARCH_RESULT = {
    "results": [
        {"product_id": "LAP-001", "title": "AeroBook 13 Pro", "price": 1399.0, "rating": 4.7},
        {"product_id": "LAP-002", "title": "QuantumSlim 14", "price": 1199.0, "rating": 4.4},
        {"product_id": "LAP-003", "title": "NimbusWork Ultra 15", "price": 999.0, "rating": 4.2},
    ],
}
# Per-ID spec sheets returned by the get_product_details mock; the key set
# also defines the valid IDs accepted by _validate_shop_comparison.
_SHOP_PRODUCT_DETAILS = {
    "LAP-001": {
        "product_id": "LAP-001",
        "title": "AeroBook 13 Pro",
        "cpu": "M-series 10-core",
        "ram_gb": 16,
        "storage_gb": 512,
        "battery_hours": 18,
        "weight_kg": 1.24,
        "price": 1399.0,
    },
    "LAP-002": {
        "product_id": "LAP-002",
        "title": "QuantumSlim 14",
        "cpu": "Core i7 12-core",
        "ram_gb": 16,
        "storage_gb": 512,
        "battery_hours": 12,
        "weight_kg": 1.35,
        "price": 1199.0,
    },
    "LAP-003": {
        "product_id": "LAP-003",
        "title": "NimbusWork Ultra 15",
        "cpu": "Ryzen 7 8-core",
        "ram_gb": 16,
        "storage_gb": 1024,
        "battery_hours": 10,
        "weight_kg": 1.70,
        "price": 999.0,
    },
}
|
|
|
|
|
|
def _shop_details_mock(args):
    """Mock get_product_details: return spec JSON, or an error for unknown IDs."""
    pid = args.get("product_id", "")
    details = _SHOP_PRODUCT_DETAILS.get(pid)
    if details is None:
        return json.dumps({"error": f"unknown product_id: {pid}"})
    return json.dumps(details)
|
|
|
|
|
|
# Test 4 response_format: comparison result — one recommendation string plus
# a ranked array (>=2) of candidate objects.
_SHOP_COMPARISON_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "laptop_comparison",
        "strict": True,
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "recommendation": {"type": "string"},
                "ranked_candidates": {
                    "type": "array",
                    "minItems": 2,
                    "items": {
                        "type": "object",
                        "additionalProperties": False,
                        "properties": {
                            "product_id": {"type": "string"},
                            "title": {"type": "string"},
                            "score": {"type": "number"},
                            "reason": {"type": "string"},
                        },
                        "required": ["product_id", "title", "score", "reason"],
                    },
                },
            },
            "required": ["recommendation", "ranked_candidates"],
        },
    },
}
|
|
|
|
# Test 4: mixed tool + structured output. Phase 1 runs the tool loop plain;
# phase 2 sends the followup with the comparison schema enforced.
SHOP_COMPARISON_TEST_CASE = {
    "name": "Tool calls then structured laptop comparison (after_tools)",
    "response_format": _SHOP_COMPARISON_SCHEMA,
    "apply_stage": "after_tools",
    "tools": _SHOP_TOOLS,
    "mock_tool_responses": {
        "search_products": lambda _: json.dumps(_SHOP_SEARCH_RESULT),
        "get_product_details": _shop_details_mock,
    },
    "messages": [
        {
            "role": "user",
            "content": (
                "I need a lightweight laptop for travel. Please search the catalogue "
                "for 'ultraportable laptop', then fetch detailed specs for at least two "
                "of the top candidates. Once you've gathered the data I'll ask you to "
                "produce a structured comparison."
            ),
        }
    ],
    "followup": (
        "Thanks. Now produce the final comparison strictly as JSON matching the "
        "laptop_comparison schema: your single best recommendation (the product_id), "
        "and a ranked_candidates array of at least two laptops, each with "
        "product_id, title, a numeric score, and a short reason."
    ),
    "validate": lambda parsed, tcs, raw: _validate_shop_comparison(parsed, tcs),
}
|
|
|
|
|
|
def _validate_shop_comparison(parsed, tcs):
    """Validate the laptop comparison JSON and that both shop tools were used."""
    names = [tc["function"]["name"] for tc in tcs]
    for tool in ("search_products", "get_product_details"):
        if tool not in names:
            return False, f"expected {tool} tool call, got {names}"
    recommendation = parsed.get("recommendation")
    if not isinstance(recommendation, str):
        return False, f"recommendation missing or not a string: {parsed!r}"
    cands = parsed.get("ranked_candidates")
    if not isinstance(cands, list) or len(cands) < 2:
        return False, f"ranked_candidates must be >=2: {cands!r}"
    valid_ids = set(_SHOP_PRODUCT_DETAILS.keys())
    candidate_pids: list = []
    for i, cand in enumerate(cands):
        if not isinstance(cand, dict):
            return False, f"candidate[{i}] not an object: {cand!r}"
        pid = cand.get("product_id")
        title = cand.get("title")
        score = cand.get("score")
        reason = cand.get("reason")
        for field, value in (("product_id", pid), ("title", title),
                             ("score", score), ("reason", reason)):
            if value is None:
                return False, f"candidate[{i}] missing {field}: {cand!r}"
        if pid not in valid_ids:
            return False, f"candidate[{i}].product_id not in catalogue: {pid!r}"
        if not isinstance(score, (int, float)):
            return False, f"candidate[{i}].score not numeric: {score!r}"
        candidate_pids.append(pid)
    # The recommendation must name a known or at least ranked product.
    if recommendation not in valid_ids and recommendation not in candidate_pids:
        return False, f"recommendation {recommendation!r} not in candidates"
    return True, (
        f"tools={names}; recommended={parsed['recommendation']}; "
        f"{len(cands)} ranked candidates"
    )
|
|
|
|
|
|
# ---- Test 5: Multi-step research then structured report (after_tools) ----
|
|
|
|
# Tool definitions for the country-research test: stats + climate lookups.
_RESEARCH_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_country_stats",
            "description": "Fetch basic statistics for a country (population, GDP, capital).",
            "parameters": {
                "type": "object",
                "properties": {
                    "country": {"type": "string"},
                },
                "required": ["country"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_climate_info",
            "description": "Fetch climate information for a country.",
            "parameters": {
                "type": "object",
                "properties": {
                    "country": {"type": "string"},
                },
                "required": ["country"],
            },
        },
    },
]
|
|
|
|
# Lookup tables backing the research mocks; keyed by lower-cased country name.
_COUNTRY_STATS = {
    "norway": {
        "country": "Norway",
        "capital": "Oslo",
        "population": 5_480_000,
        "gdp_usd_trillion": 0.48,
        "currency": "NOK",
    }
}
_CLIMATE_INFO = {
    "norway": {
        "country": "Norway",
        "climate_zone": "subarctic / temperate coastal",
        "avg_winter_temp_c": -4.5,
        "avg_summer_temp_c": 16.0,
        "annual_precipitation_mm": 1400,
    }
}
|
|
|
|
|
|
def _country_stats_mock(args):
    """Mock get_country_stats: case-insensitive lookup, JSON error if unknown."""
    key = args.get("country", "").strip().lower()
    stats = _COUNTRY_STATS.get(key)
    if stats is None:
        return json.dumps({"error": f"unknown country: {key}"})
    return json.dumps(stats)
|
|
|
|
|
|
def _climate_info_mock(args):
    """Mock get_climate_info: case-insensitive lookup, JSON error if unknown."""
    key = args.get("country", "").strip().lower()
    info = _CLIMATE_INFO.get(key)
    if info is None:
        return json.dumps({"error": f"unknown country: {key}"})
    return json.dumps(info)
|
|
|
|
|
|
# Test 5 response_format: briefing report with scalars, a bounded string
# array (2..5 highlights) and a boolean field.
_RESEARCH_REPORT_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "country_report",
        "strict": True,
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "country": {"type": "string"},
                "capital": {"type": "string"},
                "population": {"type": "integer"},
                "climate_summary": {"type": "string"},
                "highlights": {
                    "type": "array",
                    "minItems": 2,
                    "maxItems": 5,
                    "items": {"type": "string"},
                },
                "suitable_for_tourism": {"type": "boolean"},
            },
            "required": [
                "country", "capital", "population",
                "climate_summary", "highlights", "suitable_for_tourism",
            ],
        },
    },
}
|
|
|
|
# Test 5: two-tool research pipeline, then a schema-constrained briefing in
# the follow-up turn (after_tools).
COUNTRY_REPORT_TEST_CASE = {
    "name": "Research pipeline then structured country report (after_tools)",
    "response_format": _RESEARCH_REPORT_SCHEMA,
    "apply_stage": "after_tools",
    "tools": _RESEARCH_TOOLS,
    "mock_tool_responses": {
        "get_country_stats": _country_stats_mock,
        "get_climate_info": _climate_info_mock,
    },
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm preparing a short briefing on Norway. Please call the "
                "get_country_stats and get_climate_info tools to gather data "
                "first. Afterwards I'll ask for a structured summary."
            ),
        }
    ],
    "followup": (
        "Based on the tool results, produce the briefing as JSON matching the "
        "country_report schema. Populate every required field and provide between "
        "two and five highlights."
    ),
    "validate": lambda parsed, tcs, raw: _validate_country_report(parsed, tcs),
}
|
|
|
|
|
|
def _validate_country_report(parsed, tcs):
|
|
names = [tc["function"]["name"] for tc in tcs]
|
|
for required_tool in ("get_country_stats", "get_climate_info"):
|
|
if required_tool not in names:
|
|
return False, f"missing tool call {required_tool!r}: got {names}"
|
|
required = {
|
|
"country", "capital", "population",
|
|
"climate_summary", "highlights", "suitable_for_tourism",
|
|
}
|
|
missing = required - parsed.keys()
|
|
if missing:
|
|
return False, f"missing report fields: {missing}"
|
|
if "norway" not in parsed["country"].lower():
|
|
return False, f"country should reference Norway: {parsed['country']!r}"
|
|
if "oslo" not in parsed["capital"].lower():
|
|
return False, f"capital should be Oslo: {parsed['capital']!r}"
|
|
if not isinstance(parsed["population"], int) or parsed["population"] < 1_000_000:
|
|
return False, f"population implausible: {parsed['population']!r}"
|
|
if not isinstance(parsed["climate_summary"], str) or not parsed["climate_summary"]:
|
|
return False, "climate_summary must be a non-empty string"
|
|
hls = parsed["highlights"]
|
|
if not isinstance(hls, list) or not (2 <= len(hls) <= 5):
|
|
return False, f"highlights length out of range: {hls!r}"
|
|
if not all(isinstance(h, str) and h for h in hls):
|
|
return False, "each highlight must be a non-empty string"
|
|
if not isinstance(parsed["suitable_for_tourism"], bool):
|
|
return False, f"suitable_for_tourism must be bool: {parsed['suitable_for_tourism']!r}"
|
|
return True, (
|
|
f"tools={names}; report for {parsed['country']} "
|
|
f"(pop {parsed['population']}, {len(hls)} highlights)"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# All test cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Registry of every test case, executed in order for each streaming mode.
ALL_TEST_CASES = [
    BOOK_TEST_CASE,
    SENTIMENT_TEST_CASE,
    PRODUCT_JSON_OBJECT_TEST_CASE,
    RECIPE_TEST_CASE,
    SHOP_COMPARISON_TEST_CASE,
    COUNTRY_REPORT_TEST_CASE,
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main():
    """Parse CLI options, run every selected test case, and exit 0/1."""
    parser = argparse.ArgumentParser(
        description="Test llama-server structured-output capability."
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    parser.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    parser.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args()

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")

    # Non-streaming first (unless --stream-only), then streaming (unless --no-stream).
    modes: list[bool] = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases: list[dict] = ALL_TEST_CASES
    if args.test:
        needle = args.test.lower()
        cases = [c for c in cases if needle in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    # Green summary only on a clean sweep; exit status mirrors it.
    color = GREEN if passed == total else RED
    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
    _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
    sys.exit(0 if passed == total else 1)
|
|
|
|
|
|
# Script entry point: run the full suite when executed directly.
if __name__ == "__main__":
    main()
|