llama-eval : add per-task summary stats (#23151)

* llama-eval : add per-problem summary table to HTML reports

- Add chunk_idx and problem_idx to TaskState and saved case dicts
- Group completed cases by problem_idx in dump_html()
- Render per-problem summary table before individual task table
  - Columns: Problem (zero-padded), Runs, Correct (n/r),
    Tokens (min/avg/max), T/s (min/avg/max), Gen s (min/avg/max)
  - Sorted by problem index, monospace font, right-aligned numbers
  - Colspan headers for grouped stats, auto width
- Simulator: add /v1/models endpoint, timings in response,
  template-aware question matching, --dataset arg (aime/aime2025)

Assisted-by: llama.cpp:local pi

* llama-eval : add tabs for Detailed and Summary tables, apply monospace font globally

- Wrap Detailed and Summary tables in switchable tabs (Detailed active by default)
- Remove summary-section wrapper, use tab labels instead
- Apply monospace font to all tables and the top bar

Assisted-by: llama.cpp:local pi

* llama-eval : redesign top bar as CSS grid label/value pairs

- Replace flat span list with 4-column grid layout (2 pairs per row)
- Labels in muted color (#888), values in dark (#222)
- Bold dataset name and model name
- Remove the media query; always use 4 columns

Assisted-by: llama.cpp:local pi

* llama-eval : use realistic token counts and throughput in simulator

- comp_tokens: [30, 80] → [10000, 60000]
- tps_gen: derived → uniform [90.0, 110.0]
- t_gen_ms: now computed from tokens/tps

Assisted-by: llama.cpp:local pi

* llama-eval : color Answer column green/red based on correctness

Use the same .correct/.incorrect CSS classes on the Answer column
to make correct answers green and incorrect answers red.

Assisted-by: llama.cpp:local pi

* llama-eval : fix pyright errors from max(..., key=len) type inference

Use key=lambda x: len(x) instead of key=len so the type checker
infers the return type as str instead of Sized, fixing:
  - unresolved-attribute: Object of type Sized has no attribute lower
  - not-subscriptable: Cannot subscript object of type Sized

Assisted-by: llama.cpp:local pi
This commit is contained in:
Georgi Gerganov 2026-05-19 09:46:05 +03:00 committed by GitHub
parent c85a242ed0
commit d2e179a477
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 233 additions and 55 deletions

View file

@ -149,6 +149,8 @@ class TaskState:
t_gen_ms: Optional[float] = None
reasoning_content: Optional[str] = None
server_name: Optional[str] = None
chunk_idx: int = 0
problem_idx: int = 0
class EvalState:
@ -233,7 +235,9 @@ class EvalState:
tps_gen: Optional[float] = None,
t_gen_ms: Optional[float] = None,
reasoning_content: Optional[str] = None,
server_name: Optional[str] = None
server_name: Optional[str] = None,
chunk_idx: int = 0,
problem_idx: int = 0,
):
with self._lock:
if "cases" not in self.task_states:
@ -252,7 +256,9 @@ class EvalState:
"tps_gen": tps_gen,
"t_gen_ms": t_gen_ms,
"reasoning_content": reasoning_content,
"server_name": server_name
"server_name": server_name,
"chunk_idx": chunk_idx,
"problem_idx": problem_idx,
}
self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False))
@ -289,6 +295,9 @@ class EvalState:
all_cases = {}
for i, task_id in tasks_to_save:
question_text, prompt, expected = self.get_case(i)
# Extract chunk_idx from task_id for pending cases
_parts = task_id.rsplit("_", 2)
_chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
if task_id in self.task_states.get("cases", {}):
all_cases[task_id] = self.task_states["cases"][task_id]
else:
@ -306,7 +315,9 @@ class EvalState:
"tps_gen": None,
"t_gen_ms": None,
"reasoning_content": None,
"server_name": None
"server_name": None,
"chunk_idx": _chunk_idx,
"problem_idx": i,
}
ci_lower, ci_upper = self.accuracy_ci()
@ -382,11 +393,12 @@ class EvalState:
grader_log_str = self._escape_html(json.dumps(grader_log, indent=2))
escaped_server = self._escape_html(server_name)
answer_class = status_class if status == "ok" else ""
rows.append(f"""<tr class="task-row" onclick="toggleDetails('{task_id}')">
<td>{task_id}</td>
<td class="{status_class}">{status_text}</td>
<td>{self._escape_html(expected)}</td>
<td>{self._escape_html(answer)}</td>
<td class="{answer_class}">{self._escape_html(answer)}</td>
<td>{tokens_str}</td>
<td>{tps_str}</td>
<td>{t_gen_str}</td>
@ -405,6 +417,53 @@ class EvalState:
rows_html = "\n".join(rows)
# ---- per-problem summary table ----
problem_groups: Dict[int, List[Dict[str, Any]]] = {}
for _tid, _case in cases.items():
if _case.get("status") != "ok":
continue
_pidx = _case.get("problem_idx")
if _pidx is None:
_p_parts = _tid.rsplit("_", 2)
_pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0
problem_groups.setdefault(_pidx, []).append(_case)
summary_rows_html = ""
if problem_groups:
# Format the minimum, mean and maximum of `v` as a 3-tuple of strings
# (min, avg, max), ready to be dropped into three table cells.
#   v       -- collection of numbers; a falsy (e.g. empty) `v` yields
#              three empty strings.
#   fmt     -- format spec applied to the min and max values.
#   avg_fmt -- format spec applied to the mean; when None, `fmt` is reused.
def _stat(v, fmt=".1f", avg_fmt=None):
if not v:
# No samples for this metric: leave the three cells blank.
return ("", "", "")
# The mean comes from true division and is therefore a float, so callers
# using an integer-only spec such as "d" for min/max must supply a
# float-capable `avg_fmt`.
af = fmt if avg_fmt is None else avg_fmt
return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}")
summary_data = []
for pidx, g in problem_groups.items():
runs = len(g)
n_ok = sum(1 for c in g if c.get("correct", False))
toks = [c["tokens"] for c in g if c.get("tokens") is not None]
tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None]
tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None]
summary_data.append((
pidx, runs, n_ok,
_stat(toks, "d", ".0f"),
_stat(tps),
_stat(tg),
))
summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending
summary_rows_html = "\n".join(
f"""<tr class="summary-row">
<td>{p:03d}</td>
<td>{r}</td>
<td>{n}/{r}</td>
<td>{tk[0]}</td><td>{tk[1]}</td><td>{tk[2]}</td>
<td>{tp[0]}</td><td>{tp[1]}</td><td>{tp[2]}</td>
<td>{tg[0]}</td><td>{tg[1]}</td><td>{tg[2]}</td>
</tr>"""
for p, r, n, tk, tp, tg in summary_data
)
html_content = f"""<!DOCTYPE html>
<html>
<head>
@ -412,10 +471,10 @@ class EvalState:
<title>{self.dataset_type.upper()} Eval</title>
<style>
body {{ font-family: system-ui, sans-serif; margin: 0; padding: 16px; background: #fff; color: #222; }}
.bar {{ padding: 8px 0; font-size: 14px; color: #555; }}
.bar span {{ margin-right: 20px; }}
.bar b {{ color: #222; }}
table {{ width: 100%; border-collapse: collapse; font-size: 13px; }}
.bar {{ padding: 8px 0; font-size: 13px; color: #555; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; display: grid; grid-template-columns: auto 1fr auto 1fr; gap: 2px 12px; align-items: baseline; }}
.bar .label {{ color: #888; }}
.bar .value {{ color: #222; }}
table {{ width: 100%; border-collapse: collapse; font-size: 13px; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; }}
th {{ text-align: left; padding: 6px 8px; border-bottom: 2px solid #ccc; font-weight: 600; }}
td {{ padding: 4px 8px; border-bottom: 1px solid #eee; vertical-align: top; }}
.task-row {{ cursor: pointer; }}
@ -429,37 +488,88 @@ class EvalState:
.details-content {{ padding: 8px 16px; background: #f6f8fa; font-size: 12px; }}
.details-content b {{ color: #555; }}
.details-content pre {{ background: #fff; border: 1px solid #e1e4e8; padding: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word; margin: 4px 0 8px; }}
.summary-table {{ margin-bottom: 16px; font-size: 13px; width: 100%; }}
.summary-row {{ background: #fafbfc; }}
.summary-row:hover {{ background: #f5f5f5; }}
.summary-table th {{ text-align: right; font-weight: 600; }}
.summary-table th:first-child {{ text-align: left; }}
.summary-table th[colspan] {{ text-align: center; }}
.summary-table td {{ text-align: right; }}
.summary-table td:first-child {{ text-align: left; }}
.tabs {{ display: flex; border-bottom: 2px solid #ddd; margin: 12px 0 0; }}
.tab-btn {{ padding: 6px 16px; border: none; background: none; font-size: 13px; cursor: pointer; color: #555; border-bottom: 2px solid transparent; margin-bottom: -2px; font-weight: 500; }}
.tab-btn:hover {{ color: #222; }}
.tab-btn.active {{ color: #222; border-bottom-color: #222; font-weight: 600; }}
.tab-content {{ display: none; }}
.tab-content.active {{ display: block; }}
</style>
</head>
<body>
<div class="bar">
<span><b>{self.dataset_type.upper()}</b></span>
<span>Model: {self.model_name or 'N/A'}</span>
<span>Accuracy: <b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</span>
<span>Correct: <span class="correct">{n_correct}</span> / {len(completed)}</span>
<span>Pending: {n_pending}</span>
<span>Time: {self.total_time:.1f}s</span>
<span>Sampling: {sampling_str}</span>
<div class="label">Dataset</div><div class="value"><b>{self.dataset_type.upper()}</b></div>
<div class="label">Model</div><div class="value"><b>{self.model_name or 'N/A'}</b></div>
<div class="label">Accuracy</div><div class="value"><b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</div>
<div class="label">Correct</div><div class="value"><span class="correct">{n_correct}</span> / {len(completed)}</div>
<div class="label">Pending</div><div class="value">{n_pending}</div>
<div class="label">Time</div><div class="value">{self.total_time:.1f}s</div>
<div class="label">Sampling</div><div class="value">{sampling_str}</div>
</div>
<div class="tabs">
<button class="tab-btn active" data-tab="detailed" onclick="switchTab(this)">Detailed</button>
<button class="tab-btn" data-tab="summary" onclick="switchTab(this)">Summary</button>
</div>
<div id="tab-detailed" class="tab-content active">
<table>
<thead>
<tr>
<th>ID</th>
<th></th>
<th>Gold</th>
<th>Answer</th>
<th>Tokens</th>
<th>T/s</th>
<th>Gen s</th>
<th>Server</th>
</tr>
</thead>
<tbody>
{rows_html}
</tbody>
</table>
</div>
<div id="tab-summary" class="tab-content">
<table class="summary-table">
<thead>
<tr>
<th>Problem</th>
<th>Runs</th>
<th>Correct</th>
<th colspan="3">Tokens</th>
<th colspan="3">T/s</th>
<th colspan="3">Gen s</th>
</tr>
<tr>
<th></th>
<th></th>
<th></th>
<th>min</th><th>avg</th><th>max</th>
<th>min</th><th>avg</th><th>max</th>
<th>min</th><th>avg</th><th>max</th>
</tr>
</thead>
<tbody>
{summary_rows_html}
</tbody>
</table>
</div>
<table>
<thead>
<tr>
<th>ID</th>
<th></th>
<th>Gold</th>
<th>Answer</th>
<th>Tokens</th>
<th>T/s</th>
<th>Gen s</th>
<th>Server</th>
</tr>
</thead>
<tbody>
{rows_html}
</tbody>
</table>
<script>
function toggleDetails(id) {{ document.getElementById('details-'+id).classList.toggle('open'); }}
function switchTab(btn) {{
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
btn.classList.add('active');
document.getElementById('tab-'+btn.dataset.tab).classList.add('active');
}}
</script>
</body>
</html>"""
@ -1062,12 +1172,19 @@ class Processor:
) -> TaskState:
question_text, prompt, expected = eval_state.get_case(i)
# Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}"
_parts = task_id.rsplit("_", 2)
chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
problem_idx = i
task_state = TaskState(
task_id=task_id,
prompt=prompt,
expected=expected,
question_text=question_text,
server_name=server_config.name
server_name=server_config.name,
chunk_idx=chunk_idx,
problem_idx=problem_idx,
)
try:
@ -1085,7 +1202,8 @@ class Processor:
eval_state.add_result(
task_id, prompt, expected, result, None,
{"finish_reason": finish_reason}, False, task_state.status,
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
chunk_idx, problem_idx,
)
eval_state.dump()
return task_state
@ -1108,7 +1226,8 @@ class Processor:
eval_state.add_result(
task_id, prompt, expected, result, answer,
grader_log, is_correct, "ok",
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
chunk_idx, problem_idx,
)
eval_state.dump()

View file

@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]:
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
def __init__(self, split: str = "train", dataset_type: str = "aime"):
self.split = split
self.dataset_type = dataset_type
self.questions: List[Dict] = []
self._load_dataset()
def _load_dataset(self):
print(f"Loading AIME dataset (split: {self.split})...")
def _get_question_text(self, question: Dict) -> str:
"""Get question text, handling different dataset field names."""
return question.get("problem", question.get("question", ""))
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
def _load_dataset(self):
if self.dataset_type == "aime":
print(f"Loading AIME dataset (split: {self.split})...")
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
elif self.dataset_type == "aime2025":
print(f"Loading AIME2025 dataset...")
ds_list = []
for config_name in ["AIME2025-I", "AIME2025-II"]:
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test")
ds_list.extend(ds)
ds = ds_list
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
raise ValueError(f"Unknown dataset type: {self.dataset_type}")
self.questions = list(ds)
print(f"AIME dataset loaded: {len(self.questions)} questions")
print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions")
def find_question(self, request_text: str) -> Optional[Dict]:
# Strip common template prefixes to get the actual question text
# Templates include things like "Solve the following math problem step by step..."
# The actual question usually follows a blank line or after the template instruction
cleaned = request_text
# Split on double newline and take the part that looks like the problem
parts = cleaned.split('\n\n')
if len(parts) > 1:
# Find the part that's longest (likely the actual problem text)
problem_parts = [p for p in parts if len(p.strip()) > 100]
if problem_parts:
cleaned = max(problem_parts, key=lambda x: len(x))
best_match = None
best_distance = -1
best_index = -1
for i, question in enumerate(self.questions):
question_text = question["problem"]
request_lower = request_text.lower()
question_text = self._get_question_text(question)
request_lower = cleaned.lower()
question_lower = question_text.lower()
# Check if question text is contained in the cleaned request
if question_lower in request_lower or request_lower in question_lower:
debug_log(f"DEBUG: Found substring match at index {i}")
return question
# Exact match
if question_lower == request_lower:
debug_log(f"DEBUG: Found exact match at index {i}")
@ -118,7 +154,7 @@ class AimeDataset:
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
return best_match
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...")
return None
def get_answer(self, question: Dict) -> str:
@ -134,15 +170,16 @@ class Simulator:
port: int = 8033,
host: str = "localhost",
success_rate: float = 0.8,
dataset_split: str = "train"
dataset_split: str = "train",
dataset_type: str = "aime"
):
self.port = port
self.host = host
self.success_rate = success_rate
self.dataset = AimeDataset(dataset_split)
self.dataset = AimeDataset(dataset_split, dataset_type)
self.eval_state = EvalState(
id="aime-2025",
tasks=["aime"],
id=dataset_type,
tasks=[dataset_type],
task_states={},
sampling_config={"temperature": 0, "max_tokens": 2048}
)
@ -159,6 +196,10 @@ class Simulator:
else:
response_text = self._generate_wrong_answer(question)
comp_tokens = random.randint(10000, 60000)
tps_gen = random.uniform(90.0, 110.0)
t_gen_ms = comp_tokens / tps_gen * 1000
return {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
@ -176,8 +217,12 @@ class Simulator:
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
"completion_tokens": comp_tokens,
"total_tokens": 100 + comp_tokens
},
"timings": {
"predicted_ms": t_gen_ms,
"predicted_per_second": tps_gen
}
}
@ -218,6 +263,12 @@ class Simulator:
return response
class RequestHandler(BaseHTTPRequestHandler):
# Handle HTTP GET requests.
# Only the exact path "/v1/models" is served; it answers 200 with a
# one-entry model list whose id is "llama". Any other path (the comparison
# is an exact string match on self.path) gets a 404 JSON error body.
# Both responses are written through self._send_json.
def do_GET(self):
if self.path == "/v1/models":
self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200)
return
# Fallthrough: unknown GET endpoint.
self._send_json({"error": "Not found"}, 404)
def do_POST(self):
if self.path != "/v1/chat/completions":
self._send_json({"error": "Not found"}, 404)
@ -280,6 +331,13 @@ def main():
default=0.8,
help="Success rate 0-1 (default: 0.8)"
)
parser.add_argument(
"--dataset",
type=str,
default="aime",
choices=["aime", "aime2025"],
help="Dataset type (default: aime)"
)
parser.add_argument(
"--dataset-split",
type=str,
@ -294,7 +352,8 @@ def main():
port=args.port,
host=args.host,
success_rate=args.success_rate,
dataset_split=args.dataset_split
dataset_split=args.dataset_split,
dataset_type=args.dataset
)
server = HTTPServer((args.host, args.port), RequestHandler)
@ -304,7 +363,7 @@ def main():
print("\n=== llama-server-simulator ===")
print(f"Server running on http://{args.host}:{args.port}")
print(f"Success rate: {args.success_rate}")
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions")
print("\nPress Ctrl+C to stop\n")
try: