benchmark: update benchmark (#1207)

Co-authored-by: bytecii <bytecii@users.noreply.github.com>
Co-authored-by: Wendong-Fan <w3ndong.fan@gmail.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
This commit is contained in:
bytecii 2026-02-12 00:35:18 -08:00 committed by GitHub
parent c5ec78e5f9
commit f7bf29a40a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 840 additions and 35 deletions

View file

@ -39,6 +39,7 @@ jobs:
app/middleware \
app/model \
app/service \
benchmark \
tests/app \
-type f ! -path '*__pycache__*') \
app/__init__.py \

View file

@ -7,3 +7,4 @@ README_PT-BR.md
server/README_CN.md
server/README_EN.md
docs/troubleshooting/bug.md
backend/benchmark/answer/

View file

@ -38,8 +38,10 @@ repos:
- id: ruff
name: Ruff lint (auto-fix)
args: [--fix]
exclude: 'benchmark/answer/'
- id: ruff-format
name: Ruff format
exclude: 'benchmark/answer/'
# Security scanning
- repo: https://github.com/PyCQA/bandit
@ -56,6 +58,7 @@ repos:
hooks:
- id: mdformat
name: Format Markdown
exclude: 'benchmark/answer/'
additional_dependencies:
- mdformat-gfm
- mdformat_frontmatter

View file

@ -0,0 +1,4 @@
BENCHMARK_MODEL_PLATFORM="openai"
BENCHMARK_MODEL_TYPE="gpt-5.2"
BENCHMARK_API_KEY=""
BENCHMARK_API_URL="https://api.openai.com/v1"

View file

@ -76,7 +76,29 @@ The `metadata` field (optional) provides information about the benchmark:
- `description`: Brief explanation of what skills or capabilities the benchmark tests
- `tags`: Array of keywords for filtering and organization
`model_platform` and `model_type` default to `"openai"` and `"gpt-4o"`. `api_key` defaults to `$OPENAI_API_KEY`. Set `api_url` for custom endpoints.
The `model_kwargs` field is optional. Defaults come from `BENCHMARK_*` environment variables (see below), falling back to `openai` / `gpt-5.2` / `$OPENAI_API_KEY`. Per-benchmark JSON values override the environment defaults.
### Custom model providers
You can override the model for all benchmarks via environment variables (see `.env.example`):
```bash
export BENCHMARK_MODEL_PLATFORM="openai-compatible-model"
export BENCHMARK_MODEL_TYPE=""
export BENCHMARK_API_KEY=""
export BENCHMARK_API_URL=""
```
| Variable | Default | Description |
| -------------------------- | --------------------------- | --------------------------------------------------------------------------- |
| `BENCHMARK_MODEL_PLATFORM` | `openai` | Provider name. Use `openai-compatible-model` for any OpenAI-compatible API. |
| `BENCHMARK_MODEL_TYPE` | `gpt-5.2` | Model identifier passed to the provider. |
| `BENCHMARK_API_KEY` | `$OPENAI_API_KEY` | API key for the provider. |
| `BENCHMARK_API_URL` | `https://api.openai.com/v1` | Base URL for the provider's API. |
> **Important:** If the model is served through an OpenAI-compatible API (e.g. DeepSeek, MiniMax, Ollama, vLLM, LiteLLM, or any other non-OpenAI provider), set `BENCHMARK_MODEL_PLATFORM` to `openai-compatible-model` — **not** `openai`. The `openai` platform value is reserved for the official OpenAI API only.
To override a single benchmark, add `model_kwargs` to its JSON config — these take priority over environment variables.
2. Create `benchmark/checker/<n>.py` with a `check(working_directory: str) -> bool` function.

View file

@ -11,4 +11,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========

View file

@ -0,0 +1,25 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Obfuscated "Hello, WORLD!" reference answer for benchmark 0. Do NOT
# simplify or reformat the tricks: the grader inspects the AST for this
# exact structure (decorated class, __format__, property, f-string
# annotation, and the `_` naming).
#
# How it works: the lambda decorator immediately instantiates the class,
# so module-level `_` is an instance, not a class. Evaluating the
# f-string in the annotation of `__` formats that instance, invoking
# __format__ with the format spec "Hello, WORLD!"; __format__ installs a
# property on the class whose getter prints the captured spec, then
# returns "". The final `_._` reads that property, printing the message.
@lambda _: _()
class _:
    def __format__(_, __):
        # Inject a property whose getter prints the captured format spec.
        _.__class__._ = property(lambda _: print(__))
        return ""


def __() -> f"{_:Hello, WORLD!}": ...


_._

View file

@ -0,0 +1,7 @@
# warnings
PEP 702: The new `warnings.deprecated()` decorator provides a way to communicate deprecations to a static type checker and to warn on usage of deprecated classes and functions. A `DeprecationWarning` may also be emitted when a decorated function or class is used at runtime. (Contributed by Jelle Zijlstra in `gh-104003`.)
# multiprocessing
The default number of worker threads and processes is now selected using `os.process_cpu_count()` instead of `os.cpu_count()`. (Contributed by Victor Stinner in `gh-109649`.)

View file

@ -0,0 +1,77 @@
company_name,product_description,ai_category
fira,Agentic AI platform for investment firms,ai-fintech
assistant-ui,Open-source React.js library for AI chat,ai-developer-tools
artifact,Collaborative AI-native IDE for hardware engineers,ai-developer-tools
axal,AI observability for modular codebase architecture,ai-developer-tools
trainloop,Reasoning fine-tuning platform for AI models,ai-infrastructure
tally,AI agents for accounting firms automating repetitive tasks,ai-agents
sammy labs,AI that maps every click path in software for user onboarding,ai-customer-support
mercura,AI quoting for distributors and manufacturers,ai-sales
cedar,In-product AI copilot for any app,ai-productivity
browser use,Open-source web agents automating browser workflows,ai-agents
tamlabs,AI-native document editor for Microsoft Word,ai-productivity
copycat,Next-gen RPA powered by browser agents,ai-agents
wildcard,Make APIs work for AI agents,ai-infrastructure
mastra,JavaScript framework for building AI agents,ai-developer-tools
afterquery,High-quality datasets and benchmarks for AI model training,ai-data
fuse ai,AI agents to replace Salesforce,ai-sales
peppr,Self-improving knowledge base synthesizing company data,ai-productivity
sennu ai,AI agents automating the tech consulting market,ai-agents
mesh,AI finance co-worker providing real-time insights,ai-fintech
outlit,AI agents for enterprise deal creation,ai-sales
tire swing,AI for healthcare compliance,ai-healthcare
calltree ai,Enterprise-grade AI support reps for call centers,ai-customer-support
operand,B2B knowledge management platform with AI search,ai-data
gulp information services,Real-time self-improvement infrastructure for AI agents,ai-infrastructure
zeroentropy,High accuracy search API over unstructured data,ai-infrastructure
cardamon,AI compliance co-pilot for regulated financial businesses,ai-fintech
tergle,AI agents for audit workflows,ai-fintech
carecycle,Voice AI teams for Medicare agencies,ai-customer-support
sift dev,AI-powered fraud decisioning for digital businesses,ai-security
maive,AI-native manufacturing execution system for factory operations,ai-other
weave,AI to measure and analyze engineering work,ai-analytics
caseflood,AI inbound sales team for law firms,ai-legal
tejas ai,Risk decisioning platform for banks powered by AI,ai-fintech
vora ai,AI recruiter for hiring managers,ai-hr
a0.dev,AI-powered mobile app builder,ai-coding
general agency company,AI coworkers that can learn and act like humans,ai-agents
a1base,Twilio for AI agents,ai-infrastructure
verbiflow,AI-powered CRM that finds leads and closes deals,ai-sales
contrario,Fully autonomous AI recruiting agency,ai-hr
ovlo,Conversational AI for e-commerce sales,ai-sales
truffle ai,AWS for AI agents,ai-infrastructure
superglue,Self-healing integration agent for enterprise workflows,ai-infrastructure
conntour,AI to monitor thousands of security cameras,ai-security
promptless,AI teammate that auto-updates customer-facing docs,ai-productivity
stamp,AI-native email client for professionals,ai-productivity
guse,Prompt-to-automation platform for business workflows,ai-agents
subimage,AI-powered infrastructure mapping and security platform,ai-security
casixty,Reddit marketing agent for technical audiences,ai-marketing
leaping ai,Self-improving voice AI agents for call center automation,ai-customer-support
vetnio,AI copilot automating admin work for veterinary pros,ai-healthcare
trace,Voice AI customer support for financial services,ai-customer-support
quantstruct,AI documentation engineer for product docs,ai-developer-tools
onlook,AI-powered visual editor for designers,ai-developer-tools
pig,API for automating Windows apps with AI,ai-developer-tools
vantel,AI software for commercial insurance brokers,ai-fintech
agentin ai,AI agents automating enterprise software processes,ai-agents
solidroad,AI agents for sales and support team training,ai-customer-support
trata,AI-powered research desk for hedge funds,ai-analytics
sophris,AI engineer for electronic design automation,ai-developer-tools
mundo ai,High quality multilingual training data for AI models,ai-data
athenahq,AI-powered brand discovery optimization for ChatGPT,ai-marketing
lopus ai,AI agents for revenue intelligence,ai-sales
harbera,AI healthcare provider credentialing software,ai-healthcare
augento,Improving AI agents through reinforcement learning,ai-infrastructure
macadamia,AI mechanical engineer that detects and fixes design errors,ai-other
asteroid,Browser agents for regulated industries,ai-agents
gale,AI-powered immigration law firm,ai-legal
olive,Build internal tools with natural language and AI,ai-developer-tools
cuckoo labs,Real-time AI translator for sales and marketing teams,ai-marketing
mosaic,AI agents for video editing workflows,ai-agents
oki,Track company progress with AI analytics,ai-analytics
amby health,AI copilot for ambulance agencies,ai-healthcare
g lnk,AI collaboration platform for healthcare organizations,ai-healthcare
artificial societies,AI simulation of target audiences for marketing predictions,ai-marketing
overstand labs,AI insights from customer communications across channels,ai-analytics
lucidic ai,Analytics and simulation tools for AI agents,ai-analytics
1 company_name product_description ai_category
2 fira Agentic AI platform for investment firms ai-fintech
3 assistant-ui Open-source React.js library for AI chat ai-developer-tools
4 artifact Collaborative AI-native IDE for hardware engineers ai-developer-tools
5 axal AI observability for modular codebase architecture ai-developer-tools
6 trainloop Reasoning fine-tuning platform for AI models ai-infrastructure
7 tally AI agents for accounting firms automating repetitive tasks ai-agents
8 sammy labs AI that maps every click path in software for user onboarding ai-customer-support
9 mercura AI quoting for distributors and manufacturers ai-sales
10 cedar In-product AI copilot for any app ai-productivity
11 browser use Open-source web agents automating browser workflows ai-agents
12 tamlabs AI-native document editor for Microsoft Word ai-productivity
13 copycat Next-gen RPA powered by browser agents ai-agents
14 wildcard Make APIs work for AI agents ai-infrastructure
15 mastra JavaScript framework for building AI agents ai-developer-tools
16 afterquery High-quality datasets and benchmarks for AI model training ai-data
17 fuse ai AI agents to replace Salesforce ai-sales
18 peppr Self-improving knowledge base synthesizing company data ai-productivity
19 sennu ai AI agents automating the tech consulting market ai-agents
20 mesh AI finance co-worker providing real-time insights ai-fintech
21 outlit AI agents for enterprise deal creation ai-sales
22 tire swing AI for healthcare compliance ai-healthcare
23 calltree ai Enterprise-grade AI support reps for call centers ai-customer-support
24 operand B2B knowledge management platform with AI search ai-data
25 gulp information services Real-time self-improvement infrastructure for AI agents ai-infrastructure
26 zeroentropy High accuracy search API over unstructured data ai-infrastructure
27 cardamon AI compliance co-pilot for regulated financial businesses ai-fintech
28 tergle AI agents for audit workflows ai-fintech
29 carecycle Voice AI teams for Medicare agencies ai-customer-support
30 sift dev AI-powered fraud decisioning for digital businesses ai-security
31 maive AI-native manufacturing execution system for factory operations ai-other
32 weave AI to measure and analyze engineering work ai-analytics
33 caseflood AI inbound sales team for law firms ai-legal
34 tejas ai Risk decisioning platform for banks powered by AI ai-fintech
35 vora ai AI recruiter for hiring managers ai-hr
36 a0.dev AI-powered mobile app builder ai-coding
37 general agency company AI coworkers that can learn and act like humans ai-agents
38 a1base Twilio for AI agents ai-infrastructure
39 verbiflow AI-powered CRM that finds leads and closes deals ai-sales
40 contrario Fully autonomous AI recruiting agency ai-hr
41 ovlo Conversational AI for e-commerce sales ai-sales
42 truffle ai AWS for AI agents ai-infrastructure
43 superglue Self-healing integration agent for enterprise workflows ai-infrastructure
44 conntour AI to monitor thousands of security cameras ai-security
45 promptless AI teammate that auto-updates customer-facing docs ai-productivity
46 stamp AI-native email client for professionals ai-productivity
47 guse Prompt-to-automation platform for business workflows ai-agents
48 subimage AI-powered infrastructure mapping and security platform ai-security
49 casixty Reddit marketing agent for technical audiences ai-marketing
50 leaping ai Self-improving voice AI agents for call center automation ai-customer-support
51 vetnio AI copilot automating admin work for veterinary pros ai-healthcare
52 trace Voice AI customer support for financial services ai-customer-support
53 quantstruct AI documentation engineer for product docs ai-developer-tools
54 onlook AI-powered visual editor for designers ai-developer-tools
55 pig API for automating Windows apps with AI ai-developer-tools
56 vantel AI software for commercial insurance brokers ai-fintech
57 agentin ai AI agents automating enterprise software processes ai-agents
58 solidroad AI agents for sales and support team training ai-customer-support
59 trata AI-powered research desk for hedge funds ai-analytics
60 sophris AI engineer for electronic design automation ai-developer-tools
61 mundo ai High quality multilingual training data for AI models ai-data
62 athenahq AI-powered brand discovery optimization for ChatGPT ai-marketing
63 lopus ai AI agents for revenue intelligence ai-sales
64 harbera AI healthcare provider credentialing software ai-healthcare
65 augento Improving AI agents through reinforcement learning ai-infrastructure
66 macadamia AI mechanical engineer that detects and fixes design errors ai-other
67 asteroid Browser agents for regulated industries ai-agents
68 gale AI-powered immigration law firm ai-legal
69 olive Build internal tools with natural language and AI ai-developer-tools
70 cuckoo labs Real-time AI translator for sales and marketing teams ai-marketing
71 mosaic AI agents for video editing workflows ai-agents
72 oki Track company progress with AI analytics ai-analytics
73 amby health AI copilot for ambulance agencies ai-healthcare
74 g lnk AI collaboration platform for healthcare organizations ai-healthcare
75 artificial societies AI simulation of target audiences for marketing predictions ai-marketing
76 overstand labs AI insights from customer communications across channels ai-analytics
77 lucidic ai Analytics and simulation tools for AI agents ai-analytics

View file

@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Checker for benchmark 0: hello_world.py should print 'Hello, World!'"""
"""Checker for benchmark 0: hello_world.py should print 'Hello, WORLD!'"""
import subprocess
import sys
@ -33,11 +33,11 @@ def check(working_directory: str) -> bool:
)
output = result.stdout.strip()
if output == "Hello, World!":
if output == "Hello, WORLD!":
print("PASS")
return True
else:
print(f"FAIL: expected 'Hello, World!', got '{output}'")
print(f"FAIL: expected 'Hello, WORLD!', got '{output}'")
return False

View file

@ -0,0 +1,61 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Checker for benchmark 1: python313_features.md with warnings and
multiprocessing sections."""
import re
import sys
from pathlib import Path
def check(working_directory: str) -> bool:
    """Check that python313_features.md exists with the expected sections.

    Args:
        working_directory: Directory where the benchmark agent wrote its
            output files.

    Returns:
        True if the markdown file exists, has non-trivial content, contains
        at least two level-1 headings, and mentions both the ``warnings``
        and ``multiprocessing`` modules; False otherwise (a ``FAIL: ...``
        reason is printed).
    """
    md_file = Path(working_directory) / "python313_features.md"
    if not md_file.exists():
        print(f"FAIL: {md_file} does not exist")
        return False

    # Read explicitly as UTF-8 so the check does not depend on the
    # platform's default locale encoding (e.g. cp1252 on Windows).
    content = md_file.read_text(encoding="utf-8")
    if len(content.strip()) < 50:
        print("FAIL: file content is too short")
        return False

    # Check for at least 2 heading sections (# warnings, # multiprocessing)
    h1_sections = re.findall(r"^# .+", content, re.MULTILINE)
    if len(h1_sections) < 2:
        print(
            f"FAIL: expected at least 2 # sections, found {len(h1_sections)}"
        )
        return False

    lower = content.lower()
    if "warnings" not in lower:
        print("FAIL: missing warnings section")
        return False
    if "multiprocessing" not in lower:
        print("FAIL: missing multiprocessing section")
        return False

    print("PASS")
    return True
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <working_directory>")
sys.exit(1)
success = check(sys.argv[1])
sys.exit(0 if success else 1)

View file

@ -0,0 +1,92 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Checker for benchmark 2: yc_w25_b2b_ai.csv with B2B AI companies."""
import csv
import sys
from pathlib import Path
# Closed set of category labels the task's prompt allows in ai_category.
VALID_CATEGORIES = {
    "ai-agents",
    "ai-infrastructure",
    "ai-developer-tools",
    "ai-analytics",
    "ai-security",
    "ai-healthcare",
    "ai-sales",
    "ai-productivity",
    "ai-customer-support",
    "ai-coding",
    "ai-data",
    "ai-fintech",
    "ai-legal",
    "ai-hr",
    "ai-marketing",
    "ai-other",
}

# Column headers the generated CSV must contain.
REQUIRED_COLUMNS = {"company_name", "product_description", "ai_category"}


def check(working_directory: str) -> bool:
    """Validate yc_w25_b2b_ai.csv produced by the benchmark agent.

    Requirements checked: the file exists, has the required columns, lists
    at least 5 companies, company names are lowercase, descriptions are at
    most 100 characters, and every category is a known value.

    Args:
        working_directory: Directory where the agent wrote the CSV.

    Returns:
        True when all requirements hold; False otherwise (a ``FAIL: ...``
        reason is printed).
    """
    csv_file = Path(working_directory) / "yc_w25_b2b_ai.csv"
    if not csv_file.exists():
        print(f"FAIL: {csv_file} does not exist")
        return False

    with open(csv_file, newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        missing = REQUIRED_COLUMNS - set(reader.fieldnames or [])
        if missing:
            print(f"FAIL: missing columns: {missing}")
            return False
        records = list(reader)

    if len(records) < 5:
        print(f"FAIL: expected at least 5 companies, got {len(records)}")
        return False

    for i, record in enumerate(records):
        name = record.get("company_name", "")
        if name != name.lower():
            print(f"FAIL: row {i}: company_name '{name}' is not lowercase")
            return False

        desc = record.get("product_description", "")
        if len(desc) > 100:
            print(
                f"FAIL: row {i}: product_description exceeds 100 chars "
                f"({len(desc)})"
            )
            return False

        cat = record.get("ai_category", "")
        if cat not in VALID_CATEGORIES:
            print(f"FAIL: row {i}: invalid ai_category '{cat}'")
            return False

    print("PASS")
    return True
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <working_directory>")
sys.exit(1)
success = check(sys.argv[1])
sys.exit(0 if success else 1)

View file

@ -1,18 +1,20 @@
{
"metadata": {
"difficulty": "easy",
"description": "Google a specific blog post on mathspp.com about obfuscated Python, read and understand the code tricks, then faithfully reproduce the exact program as hello_world.py.",
"tags": ["browser", "coding", "python", "target-searching"]
"description": "1) search tool usage and choosing the appropriate website from results, 2) interpreting advanced obfuscated Python code patterns (requires deep coding comprehension), 3) strict instruction following with implicit output modification instead of directly copying code from the website.",
"tags": [
"instruction-following",
"browser",
"coding",
"python",
"target-searching"
]
},
"data": {
"name": "0",
"question": "Google search 'The most obscure Hello, world! program', choose the link from the website mathspp, read the page, and write a Python script named 'hello_world.py' that faithfully reproduces the exact obfuscated Hello World program shown on that page. Do not simplify or rewrite it — copy the same structure, tricks, and naming conventions used by the author. The script must print 'Hello, World!' when run.",
"question": "Find 'obscure hello world program' from mathspp, read the page, and write a Python script named 'hello_world.py' that faithfully reproduces the obfuscated Hello World program shown on that page. Do not simplify or rewrite it, just use the same structure, tricks, and naming conventions used by the author. Notice that the script MUST print 'Hello, WORLD!' when run.",
"env": {}
},
"model_kwargs": {
"model_platform": "openai",
"model_type": "gpt-5.2"
},
"tests": {
"checker": ["benchmark/checker/0.py"],
"grader": ["benchmark/grader/0.py"]

View file

@ -0,0 +1,22 @@
{
"metadata": {
"difficulty": "easy",
"description": "1) agent autonomously triggers search/browser to retrieve real data instead of hallucinating, 2) browser use with scrolling to locate specific modules, 3) instruction following for file creation with specific name and format.",
"tags": [
"browser",
"research",
"markdown",
"instruction-following",
"code-related"
]
},
"data": {
"name": "1",
"question": "Find what's new in Python 3.13 for the `warnings` and `multiprocessing` modules. Create a markdown file named 'python313_features.md' with each module name as a heading (#) and the exact text description from the official documentation as the content below each heading. Only make sure any code or script references are wrapped in backticks.",
"env": {}
},
"tests": {
"checker": ["benchmark/checker/1.py"],
"grader": ["benchmark/grader/1.py"]
}
}

View file

@ -0,0 +1,16 @@
{
"metadata": {
"difficulty": "medium",
"description": "1) benchmark browser use capability with in-depth browser operations, 2) document generation with strict format constraints on the CSV generation, 3) implicit classification for each company's category.",
"tags": ["browser", "research", "data-extraction", "csv", "multi-step"]
},
"data": {
"name": "2",
"question": "Identify all B2B companies in the Y Combinator Winter 2025 batch whose product is related to AI. After you obtain the full company list, independently investigate each company's product information in detail and consolidate all findings into a clean, well-structured CSV file named 'yc_w25_b2b_ai.csv' with columns: company_name (in lowercase), product_description (100 chars max), ai_category (use a consistent set of values including 'ai-agents', 'ai-infrastructure', 'ai-developer-tools', 'ai-analytics', 'ai-security', 'ai-healthcare', 'ai-sales', 'ai-productivity', 'ai-customer-support', 'ai-coding', 'ai-data', 'ai-fintech', 'ai-legal', 'ai-hr', 'ai-marketing', and 'ai-other').",
"env": {}
},
"tests": {
"checker": ["benchmark/checker/2.py"],
"grader": ["benchmark/grader/2.py"]
}
}

View file

@ -16,11 +16,16 @@ import json
import os
from pathlib import Path
from dotenv import dotenv_values
from dotenv import dotenv_values, load_dotenv
from pydantic import BaseModel
from app.model.chat import Chat, McpServers
# Load benchmark env files (.env takes priority over .env.development)
_BENCHMARK_DIR = Path(__file__).resolve().parent
load_dotenv(_BENCHMARK_DIR / ".env")
load_dotenv(_BENCHMARK_DIR / ".env.development")
class Env(BaseModel):
# TODO: add more environment variables
@ -37,10 +42,12 @@ class Tests(BaseModel):
class ModelKwargs(BaseModel):
model_platform: str = "openai"
model_type: str = "gpt-4o"
api_key: str | None = None
api_url: str | None = None
model_platform: str = os.environ.get("BENCHMARK_MODEL_PLATFORM", "openai")
model_type: str = os.environ.get("BENCHMARK_MODEL_TYPE", "gpt-5.2")
api_key: str | None = os.environ.get("BENCHMARK_API_KEY")
api_url: str = os.environ.get(
"BENCHMARK_API_URL", "https://api.openai.com/v1"
)
class Metadata(BaseModel):
@ -64,7 +71,11 @@ class BenchmarkData(BaseModel):
server_env.update(env_vars)
server_cfg["env"] = server_env
api_key = model_kwargs.api_key or os.environ["OPENAI_API_KEY"]
api_key = (
model_kwargs.api_key
or os.environ.get("BENCHMARK_API_KEY")
or os.environ["OPENAI_API_KEY"]
)
self._chat = Chat(
task_id=f"benchmark_{self.name}",

View file

@ -16,6 +16,7 @@ import ast
import json
import sys
from pathlib import Path
from urllib.parse import urlparse
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
@ -63,63 +64,103 @@ def grade(working_directory: str) -> tuple[int, int]:
# 1. Visited mathspp.com blog page
visited = _visited_urls()
if any(
"mathspp.com/blog/the-most-obscure-hello-world" in u for u in visited
(p := urlparse(u)).hostname is not None
and (
p.hostname == "mathspp.com" or p.hostname.endswith(".mathspp.com")
)
and "/blog/the-most-obscure-hello-world" in p.path
for u in visited
):
completed += 1
else:
print(
"MISS [1]: did not visit "
"mathspp.com/blog/the-most-obscure-hello-world"
)
script = Path(working_directory) / "hello_world.py"
if not script.exists():
print("MISS [2-7]: hello_world.py does not exist")
return completed, total
source = script.read_text()
tree = ast.parse(source)
# 1. Uses a decorator that immediately instantiates a class
# 2. Uses a decorator that immediately instantiates a class
found = False
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.decorator_list:
found = True
completed += 1
break
if not found:
print("MISS [2]: no decorated class definition found")
# 2. Overloads __format__
# 3. Overloads __format__
found = False
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name == "__format__":
found = True
completed += 1
break
if not found:
print("MISS [3]: no __format__ method found")
# 3. Uses property injection on the class
# 4. Uses property injection on the class
if "property" in source:
completed += 1
else:
print("MISS [4]: no 'property' usage found in source")
# 4. __format__ returns an empty string
# 5. __format__ returns an empty string
found = False
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name == "__format__":
for child in ast.walk(node):
if isinstance(child, ast.Return
) and isinstance(child.value, ast.Constant):
if isinstance(child, ast.Return) and isinstance(
child.value, ast.Constant
):
if child.value.value == "":
found = True
completed += 1
break
break
if not found:
print('MISS [5]: __format__ does not return an empty string ""')
# 5. Uses function annotation to trigger f-string evaluation
# 6. Uses function annotation to trigger f-string evaluation
found = False
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.returns is not None:
if isinstance(node.returns, ast.JoinedStr):
found = True
completed += 1
break
if not found:
print(
"MISS [6]: no function annotation with f-string (JoinedStr) found"
)
# 6. Uses _ as both class name and instance variable
# 7. Uses _ as both class name and instance variable
has_class_underscore = False
has_attr_underscore = False
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.name == "_":
has_class_underscore = True
if isinstance(node,
ast.Attribute) and isinstance(node.value, ast.Name):
if isinstance(node, ast.Attribute) and isinstance(
node.value, ast.Name
):
if node.value.id == "_" and node.attr == "_":
has_attr_underscore = True
if has_class_underscore and has_attr_underscore:
completed += 1
else:
parts = []
if not has_class_underscore:
parts.append("no class named '_'")
if not has_attr_underscore:
parts.append("no _._ attribute access")
print(f"MISS [7]: {', '.join(parts)}")
return completed, total

View file

@ -0,0 +1,139 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Grader for benchmark 1: evaluate python313_features.md milestones."""
import json
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
def _visited_urls() -> set[str]:
    """Extract all URLs seen in browser logs.

    Scans every ``hybrid_browser_toolkit_ws_*.log`` file under
    BROWSER_LOG_DIR and collects the first argument of each
    ``visit_page`` action. The logs appear to hold a stream of
    concatenated/whitespace-separated JSON objects rather than one JSON
    document, hence the incremental ``raw_decode`` loop below.

    Returns:
        Set of URL strings; empty if the log directory does not exist.
    """
    urls: set[str] = set()
    if not BROWSER_LOG_DIR.exists():
        return urls
    for log_file in BROWSER_LOG_DIR.glob("hybrid_browser_toolkit_ws_*.log"):
        decoder = json.JSONDecoder()
        # NOTE(review): read_text() uses the platform default encoding —
        # presumably the logs are UTF-8; confirm with the log writer.
        raw = log_file.read_text()
        pos = 0
        while pos < len(raw):
            # Skip leading whitespace between JSON values.
            stripped = raw[pos:].lstrip()
            if not stripped:
                break
            pos = len(raw) - len(stripped)
            try:
                obj, end = decoder.raw_decode(raw, pos)
                pos = end
                if not isinstance(obj, dict):
                    continue
                action = obj.get("action", "")
                if action == "visit_page":
                    args = obj.get("inputs", {}).get("args", [])
                    if args:
                        # args[0] is the visited URL.
                        urls.add(args[0])
            except (json.JSONDecodeError, ValueError):
                # Not valid JSON at this offset — advance one character
                # and retry, so junk bytes don't abort the whole scan.
                pos += 1
    return urls
def grade(working_directory: str) -> tuple[int, int]:
    """Score python313_features.md against 7 milestones.

    Milestones: (1) visited a docs.python.org page whose path mentions
    3.13, (2) ``# warnings`` heading, (3) ``# multiprocessing`` heading,
    (4) backtick-wrapped ``warnings.deprecated()`` reference, (5) mention
    of PEP 702, (6) backtick-wrapped ``os.process_cpu_count()`` reference,
    (7) mention of ``os.cpu_count``. A ``MISS [n]: ...`` line is printed
    for each milestone not met.

    Args:
        working_directory: Directory where the agent wrote its output.

    Returns:
        Tuple ``(completed, total)`` with ``total`` fixed at 7.
    """
    total = 7
    completed = 0
    md_file = Path(working_directory) / "python313_features.md"
    # 1. Visited the Python 3.13 What's New page
    visited = _visited_urls()
    # Host is matched strictly (exact or subdomain) via urlparse instead of
    # a substring test, so e.g. "docs.python.org.evil.com" does not count.
    if any(
        (p := urlparse(u)).hostname is not None
        and (
            p.hostname == "docs.python.org"
            or p.hostname.endswith(".docs.python.org")
        )
        and "3.13" in p.path
        for u in visited
    ):
        completed += 1
    else:
        print("MISS [1]: did not visit docs.python.org/3.13 What's New page")
    # Without the output file, milestones 2-7 cannot be evaluated.
    if not md_file.exists():
        print("MISS [2-7]: python313_features.md does not exist")
        return completed, total
    content = md_file.read_text()
    lower = content.lower()
    # 2. Has a # warnings heading
    if re.search(r"^# warnings\b", content, re.MULTILINE | re.IGNORECASE):
        completed += 1
    else:
        print("MISS [2]: no '# warnings' heading found")
    # 3. Has a # multiprocessing heading
    if re.search(
        r"^# multiprocessing\b", content, re.MULTILINE | re.IGNORECASE
    ):
        completed += 1
    else:
        print("MISS [3]: no '# multiprocessing' heading found")
    # 4. Mentions warnings.deprecated() with backticks
    # Accepts either the exact backticked form, or the name plus any
    # backtick usage elsewhere in the file (lenient fallback).
    if "`warnings.deprecated()`" in content or (
        "warnings.deprecated" in lower and "`" in content
    ):
        completed += 1
    else:
        print(
            "MISS [4]: missing `warnings.deprecated()` "
            "(expected backtick-wrapped reference)"
        )
    # 5. Mentions PEP 702
    if "pep 702" in lower:
        completed += 1
    else:
        print("MISS [5]: no mention of PEP 702")
    # 6. Mentions os.process_cpu_count() with backticks
    if "`os.process_cpu_count()`" in content or (
        "os.process_cpu_count" in lower and "`" in content
    ):
        completed += 1
    else:
        print(
            "MISS [6]: missing `os.process_cpu_count()` "
            "(expected backtick-wrapped reference)"
        )
    # 7. Mentions os.cpu_count() (the old default being replaced)
    if "os.cpu_count" in lower:
        completed += 1
    else:
        print("MISS [7]: no mention of os.cpu_count()")
    return completed, total
if __name__ == "__main__":
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <working_directory>")
sys.exit(1)
completed, total = grade(sys.argv[1])
print(f"{completed}/{total}")
sys.exit(0 if completed == total else 1)

View file

@ -0,0 +1,261 @@
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Grader for benchmark 2: evaluate yc_w25_b2b_ai.csv milestones."""
import csv
import json
import sys
from collections import Counter
from pathlib import Path
from urllib.parse import urlparse
# Directory where the browser toolkit writes its action logs,
# two directory levels above this grader script.
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
# Reference answer CSV for benchmark 2; used for approximate matching
# of row count and category distribution.
ANSWER_CSV = (
    Path(__file__).resolve().parents[1] / "answer" / "2" / "yc_w25_b2b_ai.csv"
)
# Closed set of values accepted in the ai_category column (milestone 6).
VALID_CATEGORIES = {
    "ai-agents",
    "ai-infrastructure",
    "ai-developer-tools",
    "ai-analytics",
    "ai-security",
    "ai-healthcare",
    "ai-sales",
    "ai-productivity",
    "ai-customer-support",
    "ai-coding",
    "ai-data",
    "ai-fintech",
    "ai-legal",
    "ai-hr",
    "ai-marketing",
    "ai-other",
}
# Column headers the agent-produced CSV must contain (milestone 3).
REQUIRED_COLUMNS = {"company_name", "product_description", "ai_category"}
def _visited_urls() -> set[str]:
    """Collect every URL passed to a visit_page action in the browser logs.

    Each log file holds a stream of concatenated JSON objects, possibly
    with interleaved garbage; objects are decoded one at a time and any
    undecodable byte is skipped.
    """
    seen: set[str] = set()
    if not BROWSER_LOG_DIR.exists():
        return seen
    decoder = json.JSONDecoder()
    for log_path in BROWSER_LOG_DIR.glob("hybrid_browser_toolkit_ws_*.log"):
        text = log_path.read_text()
        cursor = 0
        while cursor < len(text):
            remainder = text[cursor:].lstrip()
            if not remainder:
                break
            # Jump the cursor past any leading whitespace.
            cursor = len(text) - len(remainder)
            try:
                entry, cursor = decoder.raw_decode(text, cursor)
            except (json.JSONDecodeError, ValueError):
                # Not valid JSON here; advance one char and retry.
                cursor += 1
                continue
            if not isinstance(entry, dict):
                continue
            if entry.get("action", "") != "visit_page":
                continue
            call_args = entry.get("inputs", {}).get("args", [])
            if call_args:
                seen.add(call_args[0])
    return seen
def _load_answer() -> tuple[int, Counter]:
    """Read the reference CSV and return (row count, category histogram).

    Returns (0, empty Counter) when the answer file is absent.
    """
    categories: Counter = Counter()
    if not ANSWER_CSV.exists():
        return 0, categories
    total = 0
    with open(ANSWER_CSV, newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            total += 1
            label = record.get("ai_category", "")
            if label:
                categories[label] += 1
    return total, categories
def _category_overlap(expected: Counter, actual: Counter) -> float:
"""Compute distribution overlap between expected and actual categories.
Normalizes both to proportions, then sums min(expected_pct, actual_pct)
for each category. Returns a value between 0.0 and 1.0.
"""
exp_total = sum(expected.values())
act_total = sum(actual.values())
if exp_total == 0 or act_total == 0:
return 0.0
all_cats = set(expected.keys()) | set(actual.keys())
overlap = 0.0
for cat in all_cats:
exp_pct = expected.get(cat, 0) / exp_total
act_pct = actual.get(cat, 0) / act_total
overlap += min(exp_pct, act_pct)
return overlap
def grade(working_directory: str) -> tuple[int, int]:
    """Grade benchmark 2 (YC W25 B2B AI companies CSV) on ten milestones.

    Args:
        working_directory: Directory where the agent was asked to write
            yc_w25_b2b_ai.csv.

    Returns:
        tuple[int, int]: (milestones completed, total milestones).
        A MISS line is printed for each milestone not earned.
    """
    total = 10
    completed = 0
    csv_file = Path(working_directory) / "yc_w25_b2b_ai.csv"
    # 1. Visited YC W25 companies page
    visited = _visited_urls()
    if any(
        (p := urlparse(u)).hostname is not None
        and (
            p.hostname == "ycombinator.com"
            or p.hostname.endswith(".ycombinator.com")
        )
        and "W25" in u
        for u in visited
    ):
        completed += 1
    else:
        print("MISS [1]: did not visit ycombinator.com W25 companies page")
    # 2. CSV file exists -- every later milestone needs it, so bail early
    if not csv_file.exists():
        print(f"MISS [2-10]: {csv_file.name} does not exist")
        return completed, total
    completed += 1
    try:
        with open(csv_file, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            headers = set(reader.fieldnames or [])
            rows = list(reader)
    except Exception as e:
        print(f"MISS [3-10]: failed to parse CSV: {e}")
        return completed, total
    # 3. Has correct columns
    if REQUIRED_COLUMNS.issubset(headers):
        completed += 1
    else:
        missing = REQUIRED_COLUMNS - headers
        print(f"MISS [3]: missing columns: {missing}")
    # 4. All company_name values are lowercase
    non_lower = [
        row.get("company_name", "")
        for row in rows
        if row.get("company_name", "") != row.get("company_name", "").lower()
    ]
    if rows and not non_lower:
        completed += 1
    else:
        # Mirror milestones 5 and 6: report "no rows found" instead of a
        # misleading "0 company_name(s) not lowercase" when the CSV is empty.
        print(
            f"MISS [4]: {len(non_lower)} company_name(s) not lowercase, "
            f"e.g. {non_lower[:3]}"
            if non_lower
            else "MISS [4]: no rows found"
        )
    # 5. All product_description values are <= 100 chars
    too_long = [
        (i, len(row.get("product_description", "")))
        for i, row in enumerate(rows)
        if len(row.get("product_description", "")) > 100
    ]
    if rows and not too_long:
        completed += 1
    else:
        print(
            f"MISS [5]: {len(too_long)} description(s) exceed 100 chars, "
            f"e.g. row {too_long[0][0]} has {too_long[0][1]} chars"
            if too_long
            else "MISS [5]: no rows found"
        )
    # 6. All ai_category values are valid enums
    invalid_cats = [
        (i, row.get("ai_category", ""))
        for i, row in enumerate(rows)
        if row.get("ai_category", "") not in VALID_CATEGORIES
    ]
    if rows and not invalid_cats:
        completed += 1
    else:
        print(
            f"MISS [6]: {len(invalid_cats)} invalid category value(s), "
            f"e.g. row {invalid_cats[0][0]}: '{invalid_cats[0][1]}'"
            if invalid_cats
            else "MISS [6]: no rows found"
        )
    # Load answer for approximate matching of count and distribution
    expected_count, expected_cats = _load_answer()
    actual_count = len(rows)
    # 7-8. Company count within 50% -> +1, within 25% -> +1 more
    if expected_count > 0 and actual_count > 0:
        ratio = actual_count / expected_count
        if 0.5 <= ratio <= 1.5:
            completed += 1
            if 0.75 <= ratio <= 1.25:
                completed += 1
            else:
                print(
                    f"MISS [8]: count {actual_count} is within 50% but not "
                    f"25% of expected {expected_count} (ratio={ratio:.2f})"
                )
        else:
            print(
                f"MISS [7-8]: count {actual_count} is not within 50% of "
                f"expected {expected_count} (ratio={ratio:.2f})"
            )
    else:
        print(
            f"MISS [7-8]: expected_count={expected_count}, "
            f"actual_count={actual_count}"
        )
    # 9-10. Category distribution overlap >= 50% -> +1, >= 75% -> +1 more
    actual_cats: Counter = Counter()
    for row in rows:
        cat = row.get("ai_category", "")
        if cat:
            actual_cats[cat] += 1
    overlap = _category_overlap(expected_cats, actual_cats)
    if overlap >= 0.50:
        completed += 1
        if overlap >= 0.75:
            completed += 1
        else:
            print(
                f"MISS [10]: category overlap {overlap:.2%} >= 50% but < 75%"
            )
    else:
        print(
            f"MISS [9-10]: category overlap {overlap:.2%} < 50%. "
            f"Expected dist: {dict(expected_cats)}, "
            f"actual dist: {dict(actual_cats)}"
        )
    return completed, total
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the agent's
    # working directory; exit status 0 only on a perfect score.
    args = sys.argv
    if len(args) != 2:
        print(f"Usage: {args[0]} <working_directory>")
        sys.exit(1)
    done, out_of = grade(args[1])
    print(f"{done}/{out_of}")
    sys.exit(0 if done == out_of else 1)

View file

@ -15,21 +15,21 @@
import asyncio
import csv
import importlib.util
import shutil
import sys
from datetime import datetime
from pathlib import Path
from benchmark.client import BenchmarkClient
from benchmark.environment import BenchmarkConfig
from benchmark.environment import BenchmarkConfig, ModelKwargs
DATASET_DIR = Path(__file__).parent / "dataset"
RESULTS_DIR = Path(__file__).parent
BROWSER_LOG_DIR = Path(__file__).parent.parent / "browser_log"
async def run_benchmark(
client: BenchmarkClient,
benchmark_path: Path,
verbose: bool = False
client: BenchmarkClient, benchmark_path: Path, verbose: bool = False
) -> dict:
"""Load a benchmark config and run it.
@ -43,15 +43,28 @@ async def run_benchmark(
dict: Results including benchmark name, model, checker and
grader outcomes.
"""
# Clear browser logs so previous benchmark visits don't leak into this run
if BROWSER_LOG_DIR.exists():
for log_file in BROWSER_LOG_DIR.iterdir():
if log_file.is_file():
log_file.unlink()
config = BenchmarkConfig.from_json(benchmark_path)
data = config.data
model_kwargs = config.model_kwargs
model = f"{model_kwargs.model_platform}/{model_kwargs.model_type}"
# Clear previous working directory so results are from a fresh run
working_dir_path = Path(data.get_working_directory(model_kwargs))
if working_dir_path.exists():
shutil.rmtree(working_dir_path)
working_dir_path.mkdir(parents=True, exist_ok=True)
print(f"--- Benchmark: {data.name} ---")
print(f"Question: {data.question}")
print(f"Model: {model}")
print(f"Working directory: {data.get_working_directory(model_kwargs)}")
print(f"Working directory: {working_dir_path}")
print(f"Checkers: {config.tests.checker}")
print(f"Graders: {config.tests.grader}")
@ -133,6 +146,13 @@ async def main() -> None:
print(f"No benchmark configs found in {DATASET_DIR}")
return
defaults = ModelKwargs()
print("=== Benchmark Model Configuration ===")
print(f" Platform: {defaults.model_platform}")
print(f" Model: {defaults.model_type}")
print(f" API URL: {defaults.api_url}")
print()
all_results = []
async with BenchmarkClient() as client:
for path in paths:

View file

@ -38,6 +38,7 @@ dev = [
[tool.ruff]
line-length = 79
target-version = "py311"
exclude = ["benchmark/answer"]
[tool.ruff.lint]
select = [
@ -70,7 +71,7 @@ quote-style = "double"
indent-style = "space"
[tool.bandit]
exclude_dirs = ["tests", ".venv", "venv"]
exclude_dirs = ["tests", ".venv", "venv", "benchmark/answer"]
skips = [
"B101", # assert_used - OK in non-production code
"B105", # hardcoded_password_string - false positive on env var names