benchmark: update benchmark (#1207)

Co-authored-by: bytecii <bytecii@users.noreply.github.com>
Co-authored-by: Wendong-Fan <w3ndong.fan@gmail.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
This commit is contained in:
bytecii 2026-02-12 00:35:18 -08:00 committed by GitHub
parent c5ec78e5f9
commit f7bf29a40a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 840 additions and 35 deletions

View file

@ -39,6 +39,7 @@ jobs:
app/middleware \
app/model \
app/service \
benchmark \
tests/app \
-type f ! -path '*__pycache__*') \
app/__init__.py \

View file

@ -7,3 +7,4 @@ README_PT-BR.md
server/README_CN.md
server/README_EN.md
docs/troubleshooting/bug.md
backend/benchmark/answer/

View file

@ -38,8 +38,10 @@ repos:
- id: ruff
name: Ruff lint (auto-fix)
args: [--fix]
exclude: 'benchmark/answer/'
- id: ruff-format
name: Ruff format
exclude: 'benchmark/answer/'
# Security scanning
- repo: https://github.com/PyCQA/bandit
@ -56,6 +58,7 @@ repos:
hooks:
- id: mdformat
name: Format Markdown
exclude: 'benchmark/answer/'
additional_dependencies:
- mdformat-gfm
- mdformat_frontmatter

View file

@ -0,0 +1,4 @@
BENCHMARK_MODEL_PLATFORM="openai"
BENCHMARK_MODEL_TYPE="gpt-5.2"
BENCHMARK_API_KEY=""
BENCHMARK_API_URL="https://api.openai.com/v1"

View file

@ -76,7 +76,29 @@ The `metadata` field (optional) provides information about the benchmark:
- `description`: Brief explanation of what skills or capabilities the benchmark tests
- `tags`: Array of keywords for filtering and organization
`model_platform` and `model_type` default to `"openai"` and `"gpt-4o"`. `api_key` defaults to `$OPENAI_API_KEY`. Set `api_url` for custom endpoints.
The `model_kwargs` field is optional. Defaults come from `BENCHMARK_*` environment variables (see below), falling back to `openai` / `gpt-5.2` / `$OPENAI_API_KEY`. Per-benchmark JSON values override the environment defaults.
### Custom model providers
You can override the model for all benchmarks via environment variables (see `.env.example`):
```bash
export BENCHMARK_MODEL_PLATFORM="openai-compatible-model"
export BENCHMARK_MODEL_TYPE=""
export BENCHMARK_API_KEY=""
export BENCHMARK_API_URL=""
```
| Variable | Default | Description |
| -------------------------- | --------------------------- | --------------------------------------------------------------------------- |
| `BENCHMARK_MODEL_PLATFORM` | `openai` | Provider name. Use `openai-compatible-model` for any OpenAI-compatible API. |
| `BENCHMARK_MODEL_TYPE` | `gpt-5.2` | Model identifier passed to the provider. |
| `BENCHMARK_API_KEY` | `$OPENAI_API_KEY` | API key for the provider. |
| `BENCHMARK_API_URL` | `https://api.openai.com/v1` | Base URL for the provider's API. |
> **Important:** If the model is served through an OpenAI-compatible API (e.g. DeepSeek, MiniMax, Ollama, vLLM, LiteLLM, or any other non-OpenAI provider), set `BENCHMARK_MODEL_PLATFORM` to `openai-compatible-model` — **not** `openai`. The `openai` platform value is reserved for the official OpenAI API only.
To override a single benchmark, add `model_kwargs` to its JSON config — these take priority over environment variables.
2. Create `benchmark/checker/<n>.py` with a `check(working_directory: str) -> bool` function.

View file

@ -11,4 +11,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========

View file

@ -0,0 +1,25 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Obfuscated "Hello, WORLD!" reference answer for benchmark 0. Do NOT
# simplify or reformat the tricks: the grader inspects the AST for this
# exact structure (decorated class, __format__, property, f-string
# annotation, and the `_` naming).
#
# How it works: the lambda decorator immediately instantiates the class,
# so module-level `_` is an instance, not a class. Evaluating the
# f-string in the annotation of `__` formats that instance, invoking
# __format__ with the format spec "Hello, WORLD!"; __format__ installs a
# property on the class whose getter prints the captured spec, then
# returns "". The final `_._` reads that property, printing the message.
@lambda _: _()
class _:
    def __format__(_, __):
        # Inject a property whose getter prints the captured format spec.
        _.__class__._ = property(lambda _: print(__))
        return ""


def __() -> f"{_:Hello, WORLD!}": ...


_._

View file

@ -0,0 +1,7 @@
# warnings
PEP 702: The new `warnings.deprecated()` decorator provides a way to communicate deprecations to a static type checker and to warn on usage of deprecated classes and functions. A `DeprecationWarning` may also be emitted when a decorated function or class is used at runtime. (Contributed by Jelle Zijlstra in `gh-104003`.)
# multiprocessing
The default number of worker threads and processes is now selected using `os.process_cpu_count()` instead of `os.cpu_count()`. (Contributed by Victor Stinner in `gh-109649`.)

View file

@ -0,0 +1,77 @@
company_name,product_description,ai_category
fira,Agentic AI platform for investment firms,ai-fintech
assistant-ui,Open-source React.js library for AI chat,ai-developer-tools
artifact,Collaborative AI-native IDE for hardware engineers,ai-developer-tools
axal,AI observability for modular codebase architecture,ai-developer-tools
trainloop,Reasoning fine-tuning platform for AI models,ai-infrastructure
tally,AI agents for accounting firms automating repetitive tasks,ai-agents
sammy labs,AI that maps every click path in software for user onboarding,ai-customer-support
mercura,AI quoting for distributors and manufacturers,ai-sales
cedar,In-product AI copilot for any app,ai-productivity
browser use,Open-source web agents automating browser workflows,ai-agents
tamlabs,AI-native document editor for Microsoft Word,ai-productivity
copycat,Next-gen RPA powered by browser agents,ai-agents
wildcard,Make APIs work for AI agents,ai-infrastructure
mastra,JavaScript framework for building AI agents,ai-developer-tools
afterquery,High-quality datasets and benchmarks for AI model training,ai-data
fuse ai,AI agents to replace Salesforce,ai-sales
peppr,Self-improving knowledge base synthesizing company data,ai-productivity
sennu ai,AI agents automating the tech consulting market,ai-agents
mesh,AI finance co-worker providing real-time insights,ai-fintech
outlit,AI agents for enterprise deal creation,ai-sales
tire swing,AI for healthcare compliance,ai-healthcare
calltree ai,Enterprise-grade AI support reps for call centers,ai-customer-support
operand,B2B knowledge management platform with AI search,ai-data
gulp information services,Real-time self-improvement infrastructure for AI agents,ai-infrastructure
zeroentropy,High accuracy search API over unstructured data,ai-infrastructure
cardamon,AI compliance co-pilot for regulated financial businesses,ai-fintech
tergle,AI agents for audit workflows,ai-fintech
carecycle,Voice AI teams for Medicare agencies,ai-customer-support
sift dev,AI-powered fraud decisioning for digital businesses,ai-security
maive,AI-native manufacturing execution system for factory operations,ai-other
weave,AI to measure and analyze engineering work,ai-analytics
caseflood,AI inbound sales team for law firms,ai-legal
tejas ai,Risk decisioning platform for banks powered by AI,ai-fintech
vora ai,AI recruiter for hiring managers,ai-hr
a0.dev,AI-powered mobile app builder,ai-coding
general agency company,AI coworkers that can learn and act like humans,ai-agents
a1base,Twilio for AI agents,ai-infrastructure
verbiflow,AI-powered CRM that finds leads and closes deals,ai-sales
contrario,Fully autonomous AI recruiting agency,ai-hr
ovlo,Conversational AI for e-commerce sales,ai-sales
truffle ai,AWS for AI agents,ai-infrastructure
superglue,Self-healing integration agent for enterprise workflows,ai-infrastructure
conntour,AI to monitor thousands of security cameras,ai-security
promptless,AI teammate that auto-updates customer-facing docs,ai-productivity
stamp,AI-native email client for professionals,ai-productivity
guse,Prompt-to-automation platform for business workflows,ai-agents
subimage,AI-powered infrastructure mapping and security platform,ai-security
casixty,Reddit marketing agent for technical audiences,ai-marketing
leaping ai,Self-improving voice AI agents for call center automation,ai-customer-support
vetnio,AI copilot automating admin work for veterinary pros,ai-healthcare
trace,Voice AI customer support for financial services,ai-customer-support
quantstruct,AI documentation engineer for product docs,ai-developer-tools
onlook,AI-powered visual editor for designers,ai-developer-tools
pig,API for automating Windows apps with AI,ai-developer-tools
vantel,AI software for commercial insurance brokers,ai-fintech
agentin ai,AI agents automating enterprise software processes,ai-agents
solidroad,AI agents for sales and support team training,ai-customer-support
trata,AI-powered research desk for hedge funds,ai-analytics
sophris,AI engineer for electronic design automation,ai-developer-tools
mundo ai,High quality multilingual training data for AI models,ai-data
athenahq,AI-powered brand discovery optimization for ChatGPT,ai-marketing
lopus ai,AI agents for revenue intelligence,ai-sales
harbera,AI healthcare provider credentialing software,ai-healthcare
augento,Improving AI agents through reinforcement learning,ai-infrastructure
macadamia,AI mechanical engineer that detects and fixes design errors,ai-other
asteroid,Browser agents for regulated industries,ai-agents
gale,AI-powered immigration law firm,ai-legal
olive,Build internal tools with natural language and AI,ai-developer-tools
cuckoo labs,Real-time AI translator for sales and marketing teams,ai-marketing
mosaic,AI agents for video editing workflows,ai-agents
oki,Track company progress with AI analytics,ai-analytics
amby health,AI copilot for ambulance agencies,ai-healthcare
g lnk,AI collaboration platform for healthcare organizations,ai-healthcare
artificial societies,AI simulation of target audiences for marketing predictions,ai-marketing
overstand labs,AI insights from customer communications across channels,ai-analytics
lucidic ai,Analytics and simulation tools for AI agents,ai-analytics
1 company_name product_description ai_category
2 fira Agentic AI platform for investment firms ai-fintech
3 assistant-ui Open-source React.js library for AI chat ai-developer-tools
4 artifact Collaborative AI-native IDE for hardware engineers ai-developer-tools
5 axal AI observability for modular codebase architecture ai-developer-tools
6 trainloop Reasoning fine-tuning platform for AI models ai-infrastructure
7 tally AI agents for accounting firms automating repetitive tasks ai-agents
8 sammy labs AI that maps every click path in software for user onboarding ai-customer-support
9 mercura AI quoting for distributors and manufacturers ai-sales
10 cedar In-product AI copilot for any app ai-productivity
11 browser use Open-source web agents automating browser workflows ai-agents
12 tamlabs AI-native document editor for Microsoft Word ai-productivity
13 copycat Next-gen RPA powered by browser agents ai-agents
14 wildcard Make APIs work for AI agents ai-infrastructure
15 mastra JavaScript framework for building AI agents ai-developer-tools
16 afterquery High-quality datasets and benchmarks for AI model training ai-data
17 fuse ai AI agents to replace Salesforce ai-sales
18 peppr Self-improving knowledge base synthesizing company data ai-productivity
19 sennu ai AI agents automating the tech consulting market ai-agents
20 mesh AI finance co-worker providing real-time insights ai-fintech
21 outlit AI agents for enterprise deal creation ai-sales
22 tire swing AI for healthcare compliance ai-healthcare
23 calltree ai Enterprise-grade AI support reps for call centers ai-customer-support
24 operand B2B knowledge management platform with AI search ai-data
25 gulp information services Real-time self-improvement infrastructure for AI agents ai-infrastructure
26 zeroentropy High accuracy search API over unstructured data ai-infrastructure
27 cardamon AI compliance co-pilot for regulated financial businesses ai-fintech
28 tergle AI agents for audit workflows ai-fintech
29 carecycle Voice AI teams for Medicare agencies ai-customer-support
30 sift dev AI-powered fraud decisioning for digital businesses ai-security
31 maive AI-native manufacturing execution system for factory operations ai-other
32 weave AI to measure and analyze engineering work ai-analytics
33 caseflood AI inbound sales team for law firms ai-legal
34 tejas ai Risk decisioning platform for banks powered by AI ai-fintech
35 vora ai AI recruiter for hiring managers ai-hr
36 a0.dev AI-powered mobile app builder ai-coding
37 general agency company AI coworkers that can learn and act like humans ai-agents
38 a1base Twilio for AI agents ai-infrastructure
39 verbiflow AI-powered CRM that finds leads and closes deals ai-sales
40 contrario Fully autonomous AI recruiting agency ai-hr
41 ovlo Conversational AI for e-commerce sales ai-sales
42 truffle ai AWS for AI agents ai-infrastructure
43 superglue Self-healing integration agent for enterprise workflows ai-infrastructure
44 conntour AI to monitor thousands of security cameras ai-security
45 promptless AI teammate that auto-updates customer-facing docs ai-productivity
46 stamp AI-native email client for professionals ai-productivity
47 guse Prompt-to-automation platform for business workflows ai-agents
48 subimage AI-powered infrastructure mapping and security platform ai-security
49 casixty Reddit marketing agent for technical audiences ai-marketing
50 leaping ai Self-improving voice AI agents for call center automation ai-customer-support
51 vetnio AI copilot automating admin work for veterinary pros ai-healthcare
52 trace Voice AI customer support for financial services ai-customer-support
53 quantstruct AI documentation engineer for product docs ai-developer-tools
54 onlook AI-powered visual editor for designers ai-developer-tools
55 pig API for automating Windows apps with AI ai-developer-tools
56 vantel AI software for commercial insurance brokers ai-fintech
57 agentin ai AI agents automating enterprise software processes ai-agents
58 solidroad AI agents for sales and support team training ai-customer-support
59 trata AI-powered research desk for hedge funds ai-analytics
60 sophris AI engineer for electronic design automation ai-developer-tools
61 mundo ai High quality multilingual training data for AI models ai-data
62 athenahq AI-powered brand discovery optimization for ChatGPT ai-marketing
63 lopus ai AI agents for revenue intelligence ai-sales
64 harbera AI healthcare provider credentialing software ai-healthcare
65 augento Improving AI agents through reinforcement learning ai-infrastructure
66 macadamia AI mechanical engineer that detects and fixes design errors ai-other
67 asteroid Browser agents for regulated industries ai-agents
68 gale AI-powered immigration law firm ai-legal
69 olive Build internal tools with natural language and AI ai-developer-tools
70 cuckoo labs Real-time AI translator for sales and marketing teams ai-marketing
71 mosaic AI agents for video editing workflows ai-agents
72 oki Track company progress with AI analytics ai-analytics
73 amby health AI copilot for ambulance agencies ai-healthcare
74 g lnk AI collaboration platform for healthcare organizations ai-healthcare
75 artificial societies AI simulation of target audiences for marketing predictions ai-marketing
76 overstand labs AI insights from customer communications across channels ai-analytics
77 lucidic ai Analytics and simulation tools for AI agents ai-analytics

View file

@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Checker for benchmark 0: hello_world.py should print 'Hello, World!'"""
"""Checker for benchmark 0: hello_world.py should print 'Hello, WORLD!'"""
import subprocess
import sys
@ -33,11 +33,11 @@ def check(working_directory: str) -> bool:
)
output = result.stdout.strip()
if output == "Hello, World!":
if output == "Hello, WORLD!":
print("PASS")
return True
else:
print(f"FAIL: expected 'Hello, World!', got '{output}'")
print(f"FAIL: expected 'Hello, WORLD!', got '{output}'")
return False

View file

@ -0,0 +1,61 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Checker for benchmark 1: python313_features.md with warnings and
multiprocessing sections."""
import re
import sys
from pathlib import Path
def check(working_directory: str) -> bool:
    """Check that python313_features.md exists with the expected sections.

    Args:
        working_directory: Directory where the benchmark agent wrote its
            output files.

    Returns:
        True if the markdown file exists, has non-trivial content, contains
        at least two level-1 headings, and mentions both the ``warnings``
        and ``multiprocessing`` modules; False otherwise (a ``FAIL: ...``
        reason is printed).
    """
    md_file = Path(working_directory) / "python313_features.md"
    if not md_file.exists():
        print(f"FAIL: {md_file} does not exist")
        return False

    # Read explicitly as UTF-8 so the check does not depend on the
    # platform's default locale encoding (e.g. cp1252 on Windows).
    content = md_file.read_text(encoding="utf-8")
    if len(content.strip()) < 50:
        print("FAIL: file content is too short")
        return False

    # Check for at least 2 heading sections (# warnings, # multiprocessing)
    h1_sections = re.findall(r"^# .+", content, re.MULTILINE)
    if len(h1_sections) < 2:
        print(
            f"FAIL: expected at least 2 # sections, found {len(h1_sections)}"
        )
        return False

    lower = content.lower()
    if "warnings" not in lower:
        print("FAIL: missing warnings section")
        return False
    if "multiprocessing" not in lower:
        print("FAIL: missing multiprocessing section")
        return False

    print("PASS")
    return True
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <working_directory>")
sys.exit(1)
success = check(sys.argv[1])
sys.exit(0 if success else 1)

View file

@ -0,0 +1,92 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Checker for benchmark 2: yc_w25_b2b_ai.csv with B2B AI companies."""
import csv
import sys
from pathlib import Path
# Closed set of category labels the task's prompt allows in ai_category.
VALID_CATEGORIES = {
    "ai-agents",
    "ai-infrastructure",
    "ai-developer-tools",
    "ai-analytics",
    "ai-security",
    "ai-healthcare",
    "ai-sales",
    "ai-productivity",
    "ai-customer-support",
    "ai-coding",
    "ai-data",
    "ai-fintech",
    "ai-legal",
    "ai-hr",
    "ai-marketing",
    "ai-other",
}

# Column headers the generated CSV must contain.
REQUIRED_COLUMNS = {"company_name", "product_description", "ai_category"}


def check(working_directory: str) -> bool:
    """Validate yc_w25_b2b_ai.csv produced by the benchmark agent.

    Requirements checked: the file exists, has the required columns, lists
    at least 5 companies, company names are lowercase, descriptions are at
    most 100 characters, and every category is a known value.

    Args:
        working_directory: Directory where the agent wrote the CSV.

    Returns:
        True when all requirements hold; False otherwise (a ``FAIL: ...``
        reason is printed).
    """
    csv_file = Path(working_directory) / "yc_w25_b2b_ai.csv"
    if not csv_file.exists():
        print(f"FAIL: {csv_file} does not exist")
        return False

    with open(csv_file, newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        missing = REQUIRED_COLUMNS - set(reader.fieldnames or [])
        if missing:
            print(f"FAIL: missing columns: {missing}")
            return False
        records = list(reader)

    if len(records) < 5:
        print(f"FAIL: expected at least 5 companies, got {len(records)}")
        return False

    for i, record in enumerate(records):
        name = record.get("company_name", "")
        if name != name.lower():
            print(f"FAIL: row {i}: company_name '{name}' is not lowercase")
            return False

        desc = record.get("product_description", "")
        if len(desc) > 100:
            print(
                f"FAIL: row {i}: product_description exceeds 100 chars "
                f"({len(desc)})"
            )
            return False

        cat = record.get("ai_category", "")
        if cat not in VALID_CATEGORIES:
            print(f"FAIL: row {i}: invalid ai_category '{cat}'")
            return False

    print("PASS")
    return True
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <working_directory>")
sys.exit(1)
success = check(sys.argv[1])
sys.exit(0 if success else 1)

View file

@ -1,18 +1,20 @@
{
"metadata": {
"difficulty": "easy",
"description": "Google a specific blog post on mathspp.com about obfuscated Python, read and understand the code tricks, then faithfully reproduce the exact program as hello_world.py.",
"tags": ["browser", "coding", "python", "target-searching"]
"description": "1) search tool usage and choosing the appropriate website from results, 2) interpreting advanced obfuscated Python code patterns (requires deep coding comprehension), 3) strict instruction following with implicit output modification instead of directly copying code from the website.",
"tags": [
"instruction-following",
"browser",
"coding",
"python",
"target-searching"
]
},
"data": {
"name": "0",
"question": "Google search 'The most obscure Hello, world! program', choose the link from the website mathspp, read the page, and write a Python script named 'hello_world.py' that faithfully reproduces the exact obfuscated Hello World program shown on that page. Do not simplify or rewrite it — copy the same structure, tricks, and naming conventions used by the author. The script must print 'Hello, World!' when run.",
"question": "Find 'obscure hello world program' from mathspp, read the page, and write a Python script named 'hello_world.py' that faithfully reproduces the obfuscated Hello World program shown on that page. Do not simplify or rewrite it, just use the same structure, tricks, and naming conventions used by the author. Notice that the script MUST print 'Hello, WORLD!' when run.",
"env": {}
},
"model_kwargs": {
"model_platform": "openai",
"model_type": "gpt-5.2"
},
"tests": {
"checker": ["benchmark/checker/0.py"],
"grader": ["benchmark/grader/0.py"]

View file

@ -0,0 +1,22 @@
{
"metadata": {
"difficulty": "easy",
"description": "1) agent autonomously triggers search/browser to retrieve real data instead of hallucinating, 2) browser use with scrolling to locate specific modules, 3) instruction following for file creation with specific name and format.",
"tags": [
"browser",
"research",
"markdown",
"instruction-following",
"code-related"
]
},
"data": {
"name": "1",
"question": "Find what's new in Python 3.13 for the `warnings` and `multiprocessing` modules. Create a markdown file named 'python313_features.md' with each module name as a heading (#) and the exact text description from the official documentation as the content below each heading. Only make sure any code or script references are wrapped in backticks.",
"env": {}
},
"tests": {
"checker": ["benchmark/checker/1.py"],
"grader": ["benchmark/grader/1.py"]
}
}

View file

@ -0,0 +1,16 @@
{
"metadata": {
"difficulty": "medium",
"description": "1) benchmark browser use capability with in-depth browser operations, 2) document generation with strict format constraints on the CSV generation, 3) implicit classification for each company's category.",
"tags": ["browser", "research", "data-extraction", "csv", "multi-step"]
},
"data": {
"name": "2",
"question": "Identify all B2B companies in the Y Combinator Winter 2025 batch whose product is related to AI. After you obtain the full company list, independently investigate each company's product information in detail and consolidate all findings into a clean, well-structured CSV file named 'yc_w25_b2b_ai.csv' with columns: company_name (in lowercase), product_description (100 chars max), ai_category (use a consistent set of values including 'ai-agents', 'ai-infrastructure', 'ai-developer-tools', 'ai-analytics', 'ai-security', 'ai-healthcare', 'ai-sales', 'ai-productivity', 'ai-customer-support', 'ai-coding', 'ai-data', 'ai-fintech', 'ai-legal', 'ai-hr', 'ai-marketing', and 'ai-other').",
"env": {}
},
"tests": {
"checker": ["benchmark/checker/2.py"],
"grader": ["benchmark/grader/2.py"]
}
}

View file

@ -16,11 +16,16 @@ import json
import os
from pathlib import Path
from dotenv import dotenv_values
from dotenv import dotenv_values, load_dotenv
from pydantic import BaseModel
from app.model.chat import Chat, McpServers
# Load benchmark env files (.env takes priority over .env.development)
_BENCHMARK_DIR = Path(__file__).resolve().parent
load_dotenv(_BENCHMARK_DIR / ".env")
load_dotenv(_BENCHMARK_DIR / ".env.development")
class Env(BaseModel):
# TODO: add more environment variables
@ -37,10 +42,12 @@ class Tests(BaseModel):
class ModelKwargs(BaseModel):
model_platform: str = "openai"
model_type: str = "gpt-4o"
api_key: str | None = None
api_url: str | None = None
model_platform: str = os.environ.get("BENCHMARK_MODEL_PLATFORM", "openai")
model_type: str = os.environ.get("BENCHMARK_MODEL_TYPE", "gpt-5.2")
api_key: str | None = os.environ.get("BENCHMARK_API_KEY")
api_url: str = os.environ.get(
"BENCHMARK_API_URL", "https://api.openai.com/v1"
)
class Metadata(BaseModel):
@ -64,7 +71,11 @@ class BenchmarkData(BaseModel):
server_env.update(env_vars)
server_cfg["env"] = server_env
api_key = model_kwargs.api_key or os.environ["OPENAI_API_KEY"]
api_key = (
model_kwargs.api_key
or os.environ.get("BENCHMARK_API_KEY")
or os.environ["OPENAI_API_KEY"]
)
self._chat = Chat(
task_id=f"benchmark_{self.name}",

View file

@ -16,6 +16,7 @@ import ast
import json
import sys
from pathlib import Path
from urllib.parse import urlparse
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
@ -63,63 +64,103 @@ def grade(working_directory: str) -> tuple[int, int]:
# 1. Visited mathspp.com blog page
visited = _visited_urls()
if any(
"mathspp.com/blog/the-most-obscure-hello-world" in u for u in visited
(p := urlparse(u)).hostname is not None
and (
p.hostname == "mathspp.com" or p.hostname.endswith(".mathspp.com")
)
and "/blog/the-most-obscure-hello-world" in p.path
for u in visited
):
completed += 1
else:
print(
"MISS [1]: did not visit "
"mathspp.com/blog/the-most-obscure-hello-world"
)
script = Path(working_directory) / "hello_world.py"
if not script.exists():
print("MISS [2-7]: hello_world.py does not exist")
return completed, total
source = script.read_text()
tree = ast.parse(source)
# 1. Uses a decorator that immediately instantiates a class
# 2. Uses a decorator that immediately instantiates a class
found = False
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.decorator_list:
found = True
completed += 1
break
if not found:
print("MISS [2]: no decorated class definition found")
# 2. Overloads __format__
# 3. Overloads __format__
found = False
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name == "__format__":
found = True
completed += 1
break
if not found:
print("MISS [3]: no __format__ method found")
# 3. Uses property injection on the class
# 4. Uses property injection on the class
if "property" in source:
completed += 1
else:
print("MISS [4]: no 'property' usage found in source")
# 4. __format__ returns an empty string
# 5. __format__ returns an empty string
found = False
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name == "__format__":
for child in ast.walk(node):
if isinstance(child, ast.Return
) and isinstance(child.value, ast.Constant):
if isinstance(child, ast.Return) and isinstance(
child.value, ast.Constant
):
if child.value.value == "":
found = True
completed += 1
break
break
if not found:
print('MISS [5]: __format__ does not return an empty string ""')
# 5. Uses function annotation to trigger f-string evaluation
# 6. Uses function annotation to trigger f-string evaluation
found = False
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.returns is not None:
if isinstance(node.returns, ast.JoinedStr):
found = True
completed += 1
break
if not found:
print(
"MISS [6]: no function annotation with f-string (JoinedStr) found"
)
# 6. Uses _ as both class name and instance variable
# 7. Uses _ as both class name and instance variable
has_class_underscore = False
has_attr_underscore = False
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.name == "_":
has_class_underscore = True
if isinstance(node,
ast.Attribute) and isinstance(node.value, ast.Name):
if isinstance(node, ast.Attribute) and isinstance(
node.value, ast.Name
):
if node.value.id == "_" and node.attr == "_":
has_attr_underscore = True
if has_class_underscore and has_attr_underscore:
completed += 1
else:
parts = []
if not has_class_underscore:
parts.append("no class named '_'")
if not has_attr_underscore:
parts.append("no _._ attribute access")
print(f"MISS [7]: {', '.join(parts)}")
return completed, total

View file

@ -0,0 +1,139 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Grader for benchmark 1: evaluate python313_features.md milestones."""
import json
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
def _visited_urls() -> set[str]:
    """Extract all URLs seen in browser logs.

    Scans every ``hybrid_browser_toolkit_ws_*.log`` file under
    BROWSER_LOG_DIR and collects the first argument of each
    ``visit_page`` action. The logs appear to hold a stream of
    concatenated/whitespace-separated JSON objects rather than one JSON
    document, hence the incremental ``raw_decode`` loop below.

    Returns:
        Set of URL strings; empty if the log directory does not exist.
    """
    urls: set[str] = set()
    if not BROWSER_LOG_DIR.exists():
        return urls
    for log_file in BROWSER_LOG_DIR.glob("hybrid_browser_toolkit_ws_*.log"):
        decoder = json.JSONDecoder()
        # NOTE(review): read_text() uses the platform default encoding —
        # presumably the logs are UTF-8; confirm with the log writer.
        raw = log_file.read_text()
        pos = 0
        while pos < len(raw):
            # Skip leading whitespace between JSON values.
            stripped = raw[pos:].lstrip()
            if not stripped:
                break
            pos = len(raw) - len(stripped)
            try:
                obj, end = decoder.raw_decode(raw, pos)
                pos = end
                if not isinstance(obj, dict):
                    continue
                action = obj.get("action", "")
                if action == "visit_page":
                    args = obj.get("inputs", {}).get("args", [])
                    if args:
                        # args[0] is the visited URL.
                        urls.add(args[0])
            except (json.JSONDecodeError, ValueError):
                # Not valid JSON at this offset — advance one character
                # and retry, so junk bytes don't abort the whole scan.
                pos += 1
    return urls
def grade(working_directory: str) -> tuple[int, int]:
    """Score python313_features.md against 7 milestones.

    Milestones: (1) visited a docs.python.org page whose path mentions
    3.13, (2) ``# warnings`` heading, (3) ``# multiprocessing`` heading,
    (4) backtick-wrapped ``warnings.deprecated()`` reference, (5) mention
    of PEP 702, (6) backtick-wrapped ``os.process_cpu_count()`` reference,
    (7) mention of ``os.cpu_count``. A ``MISS [n]: ...`` line is printed
    for each milestone not met.

    Args:
        working_directory: Directory where the agent wrote its output.

    Returns:
        Tuple ``(completed, total)`` with ``total`` fixed at 7.
    """
    total = 7
    completed = 0
    md_file = Path(working_directory) / "python313_features.md"
    # 1. Visited the Python 3.13 What's New page
    visited = _visited_urls()
    # Host is matched strictly (exact or subdomain) via urlparse instead of
    # a substring test, so e.g. "docs.python.org.evil.com" does not count.
    if any(
        (p := urlparse(u)).hostname is not None
        and (
            p.hostname == "docs.python.org"
            or p.hostname.endswith(".docs.python.org")
        )
        and "3.13" in p.path
        for u in visited
    ):
        completed += 1
    else:
        print("MISS [1]: did not visit docs.python.org/3.13 What's New page")
    # Without the output file, milestones 2-7 cannot be evaluated.
    if not md_file.exists():
        print("MISS [2-7]: python313_features.md does not exist")
        return completed, total
    content = md_file.read_text()
    lower = content.lower()
    # 2. Has a # warnings heading
    if re.search(r"^# warnings\b", content, re.MULTILINE | re.IGNORECASE):
        completed += 1
    else:
        print("MISS [2]: no '# warnings' heading found")
    # 3. Has a # multiprocessing heading
    if re.search(
        r"^# multiprocessing\b", content, re.MULTILINE | re.IGNORECASE
    ):
        completed += 1
    else:
        print("MISS [3]: no '# multiprocessing' heading found")
    # 4. Mentions warnings.deprecated() with backticks
    # Accepts either the exact backticked form, or the name plus any
    # backtick usage elsewhere in the file (lenient fallback).
    if "`warnings.deprecated()`" in content or (
        "warnings.deprecated" in lower and "`" in content
    ):
        completed += 1
    else:
        print(
            "MISS [4]: missing `warnings.deprecated()` "
            "(expected backtick-wrapped reference)"
        )
    # 5. Mentions PEP 702
    if "pep 702" in lower:
        completed += 1
    else:
        print("MISS [5]: no mention of PEP 702")
    # 6. Mentions os.process_cpu_count() with backticks
    if "`os.process_cpu_count()`" in content or (
        "os.process_cpu_count" in lower and "`" in content
    ):
        completed += 1
    else:
        print(
            "MISS [6]: missing `os.process_cpu_count()` "
            "(expected backtick-wrapped reference)"
        )
    # 7. Mentions os.cpu_count() (the old default being replaced)
    if "os.cpu_count" in lower:
        completed += 1
    else:
        print("MISS [7]: no mention of os.cpu_count()")
    return completed, total
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <working_directory>")
sys.exit(1)
completed, total = grade(sys.argv[1])
print(f"{completed}/{total}")
sys.exit(0 if completed == total else 1)

View file

@ -0,0 +1,261 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Grader for benchmark 2: evaluate yc_w25_b2b_ai.csv milestones."""
import csv
import json
import sys
from collections import Counter
from pathlib import Path
from urllib.parse import urlparse
# Directory where the browser toolkit writes its action logs,
# two directory levels above this grader script.
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
# Reference answer CSV for benchmark 2; used for approximate matching
# of row count and category distribution.
ANSWER_CSV = (
    Path(__file__).resolve().parents[1] / "answer" / "2" / "yc_w25_b2b_ai.csv"
)
# Closed set of values accepted in the ai_category column (milestone 6).
VALID_CATEGORIES = {
    "ai-agents",
    "ai-infrastructure",
    "ai-developer-tools",
    "ai-analytics",
    "ai-security",
    "ai-healthcare",
    "ai-sales",
    "ai-productivity",
    "ai-customer-support",
    "ai-coding",
    "ai-data",
    "ai-fintech",
    "ai-legal",
    "ai-hr",
    "ai-marketing",
    "ai-other",
}
# Column headers the agent-produced CSV must contain (milestone 3).
REQUIRED_COLUMNS = {"company_name", "product_description", "ai_category"}
def _visited_urls() -> set[str]:
    """Collect every URL passed to a visit_page action in the browser logs.

    Each log file holds a stream of concatenated JSON objects, possibly
    with interleaved garbage; objects are decoded one at a time and any
    undecodable byte is skipped.
    """
    seen: set[str] = set()
    if not BROWSER_LOG_DIR.exists():
        return seen
    decoder = json.JSONDecoder()
    for log_path in BROWSER_LOG_DIR.glob("hybrid_browser_toolkit_ws_*.log"):
        text = log_path.read_text()
        cursor = 0
        while cursor < len(text):
            remainder = text[cursor:].lstrip()
            if not remainder:
                break
            # Jump the cursor past any leading whitespace.
            cursor = len(text) - len(remainder)
            try:
                entry, cursor = decoder.raw_decode(text, cursor)
            except (json.JSONDecodeError, ValueError):
                # Not valid JSON here; advance one char and retry.
                cursor += 1
                continue
            if not isinstance(entry, dict):
                continue
            if entry.get("action", "") != "visit_page":
                continue
            call_args = entry.get("inputs", {}).get("args", [])
            if call_args:
                seen.add(call_args[0])
    return seen
def _load_answer() -> tuple[int, Counter]:
    """Read the reference CSV and return (row count, category histogram).

    Returns (0, empty Counter) when the answer file is absent.
    """
    categories: Counter = Counter()
    if not ANSWER_CSV.exists():
        return 0, categories
    total = 0
    with open(ANSWER_CSV, newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            total += 1
            label = record.get("ai_category", "")
            if label:
                categories[label] += 1
    return total, categories
def _category_overlap(expected: Counter, actual: Counter) -> float:
"""Compute distribution overlap between expected and actual categories.
Normalizes both to proportions, then sums min(expected_pct, actual_pct)
for each category. Returns a value between 0.0 and 1.0.
"""
exp_total = sum(expected.values())
act_total = sum(actual.values())
if exp_total == 0 or act_total == 0:
return 0.0
all_cats = set(expected.keys()) | set(actual.keys())
overlap = 0.0
for cat in all_cats:
exp_pct = expected.get(cat, 0) / exp_total
act_pct = actual.get(cat, 0) / act_total
overlap += min(exp_pct, act_pct)
return overlap
def grade(working_directory: str) -> tuple[int, int]:
    """Grade benchmark 2 (YC W25 B2B AI companies CSV) on ten milestones.

    Args:
        working_directory: Directory where the agent was asked to write
            yc_w25_b2b_ai.csv.

    Returns:
        tuple[int, int]: (milestones completed, total milestones).
        A MISS line is printed for each milestone not earned.
    """
    total = 10
    completed = 0
    csv_file = Path(working_directory) / "yc_w25_b2b_ai.csv"
    # 1. Visited YC W25 companies page
    visited = _visited_urls()
    if any(
        (p := urlparse(u)).hostname is not None
        and (
            p.hostname == "ycombinator.com"
            or p.hostname.endswith(".ycombinator.com")
        )
        and "W25" in u
        for u in visited
    ):
        completed += 1
    else:
        print("MISS [1]: did not visit ycombinator.com W25 companies page")
    # 2. CSV file exists -- every later milestone needs it, so bail early
    if not csv_file.exists():
        print(f"MISS [2-10]: {csv_file.name} does not exist")
        return completed, total
    completed += 1
    try:
        with open(csv_file, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            headers = set(reader.fieldnames or [])
            rows = list(reader)
    except Exception as e:
        print(f"MISS [3-10]: failed to parse CSV: {e}")
        return completed, total
    # 3. Has correct columns
    if REQUIRED_COLUMNS.issubset(headers):
        completed += 1
    else:
        missing = REQUIRED_COLUMNS - headers
        print(f"MISS [3]: missing columns: {missing}")
    # 4. All company_name values are lowercase
    non_lower = [
        row.get("company_name", "")
        for row in rows
        if row.get("company_name", "") != row.get("company_name", "").lower()
    ]
    if rows and not non_lower:
        completed += 1
    else:
        # Mirror milestones 5 and 6: report "no rows found" instead of a
        # misleading "0 company_name(s) not lowercase" when the CSV is empty.
        print(
            f"MISS [4]: {len(non_lower)} company_name(s) not lowercase, "
            f"e.g. {non_lower[:3]}"
            if non_lower
            else "MISS [4]: no rows found"
        )
    # 5. All product_description values are <= 100 chars
    too_long = [
        (i, len(row.get("product_description", "")))
        for i, row in enumerate(rows)
        if len(row.get("product_description", "")) > 100
    ]
    if rows and not too_long:
        completed += 1
    else:
        print(
            f"MISS [5]: {len(too_long)} description(s) exceed 100 chars, "
            f"e.g. row {too_long[0][0]} has {too_long[0][1]} chars"
            if too_long
            else "MISS [5]: no rows found"
        )
    # 6. All ai_category values are valid enums
    invalid_cats = [
        (i, row.get("ai_category", ""))
        for i, row in enumerate(rows)
        if row.get("ai_category", "") not in VALID_CATEGORIES
    ]
    if rows and not invalid_cats:
        completed += 1
    else:
        print(
            f"MISS [6]: {len(invalid_cats)} invalid category value(s), "
            f"e.g. row {invalid_cats[0][0]}: '{invalid_cats[0][1]}'"
            if invalid_cats
            else "MISS [6]: no rows found"
        )
    # Load answer for approximate matching of count and distribution
    expected_count, expected_cats = _load_answer()
    actual_count = len(rows)
    # 7-8. Company count within 50% -> +1, within 25% -> +1 more
    if expected_count > 0 and actual_count > 0:
        ratio = actual_count / expected_count
        if 0.5 <= ratio <= 1.5:
            completed += 1
            if 0.75 <= ratio <= 1.25:
                completed += 1
            else:
                print(
                    f"MISS [8]: count {actual_count} is within 50% but not "
                    f"25% of expected {expected_count} (ratio={ratio:.2f})"
                )
        else:
            print(
                f"MISS [7-8]: count {actual_count} is not within 50% of "
                f"expected {expected_count} (ratio={ratio:.2f})"
            )
    else:
        print(
            f"MISS [7-8]: expected_count={expected_count}, "
            f"actual_count={actual_count}"
        )
    # 9-10. Category distribution overlap >= 50% -> +1, >= 75% -> +1 more
    actual_cats: Counter = Counter()
    for row in rows:
        cat = row.get("ai_category", "")
        if cat:
            actual_cats[cat] += 1
    overlap = _category_overlap(expected_cats, actual_cats)
    if overlap >= 0.50:
        completed += 1
        if overlap >= 0.75:
            completed += 1
        else:
            print(
                f"MISS [10]: category overlap {overlap:.2%} >= 50% but < 75%"
            )
    else:
        print(
            f"MISS [9-10]: category overlap {overlap:.2%} < 50%. "
            f"Expected dist: {dict(expected_cats)}, "
            f"actual dist: {dict(actual_cats)}"
        )
    return completed, total
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the agent's
    # working directory; exit status 0 only on a perfect score.
    args = sys.argv
    if len(args) != 2:
        print(f"Usage: {args[0]} <working_directory>")
        sys.exit(1)
    done, out_of = grade(args[1])
    print(f"{done}/{out_of}")
    sys.exit(0 if done == out_of else 1)

View file

@ -15,21 +15,21 @@
import asyncio
import csv
import importlib.util
import shutil
import sys
from datetime import datetime
from pathlib import Path
from benchmark.client import BenchmarkClient
from benchmark.environment import BenchmarkConfig
from benchmark.environment import BenchmarkConfig, ModelKwargs
DATASET_DIR = Path(__file__).parent / "dataset"
RESULTS_DIR = Path(__file__).parent
BROWSER_LOG_DIR = Path(__file__).parent.parent / "browser_log"
async def run_benchmark(
client: BenchmarkClient,
benchmark_path: Path,
verbose: bool = False
client: BenchmarkClient, benchmark_path: Path, verbose: bool = False
) -> dict:
"""Load a benchmark config and run it.
@ -43,15 +43,28 @@ async def run_benchmark(
dict: Results including benchmark name, model, checker and
grader outcomes.
"""
# Clear browser logs so previous benchmark visits don't leak into this run
if BROWSER_LOG_DIR.exists():
for log_file in BROWSER_LOG_DIR.iterdir():
if log_file.is_file():
log_file.unlink()
config = BenchmarkConfig.from_json(benchmark_path)
data = config.data
model_kwargs = config.model_kwargs
model = f"{model_kwargs.model_platform}/{model_kwargs.model_type}"
# Clear previous working directory so results are from a fresh run
working_dir_path = Path(data.get_working_directory(model_kwargs))
if working_dir_path.exists():
shutil.rmtree(working_dir_path)
working_dir_path.mkdir(parents=True, exist_ok=True)
print(f"--- Benchmark: {data.name} ---")
print(f"Question: {data.question}")
print(f"Model: {model}")
print(f"Working directory: {data.get_working_directory(model_kwargs)}")
print(f"Working directory: {working_dir_path}")
print(f"Checkers: {config.tests.checker}")
print(f"Graders: {config.tests.grader}")
@ -133,6 +146,13 @@ async def main() -> None:
print(f"No benchmark configs found in {DATASET_DIR}")
return
defaults = ModelKwargs()
print("=== Benchmark Model Configuration ===")
print(f" Platform: {defaults.model_platform}")
print(f" Model: {defaults.model_type}")
print(f" API URL: {defaults.api_url}")
print()
all_results = []
async with BenchmarkClient() as client:
for path in paths:

View file

@ -38,6 +38,7 @@ dev = [
[tool.ruff]
line-length = 79
target-version = "py311"
exclude = ["benchmark/answer"]
[tool.ruff.lint]
select = [
@ -70,7 +71,7 @@ quote-style = "double"
indent-style = "space"
[tool.bandit]
exclude_dirs = ["tests", ".venv", "venv"]
exclude_dirs = ["tests", ".venv", "venv", "benchmark/answer"]
skips = [
"B101", # assert_used - OK in non-production code
"B105", # hardcoded_password_string - false positive on env var names