eigent/backend/benchmark/main.py
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
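"""Run Eigent benchmark configurations end to end.

Each JSON config in the dataset directory (or any config paths passed on the
command line) is executed through BenchmarkClient, evaluated with its checker
and grader scripts, and summarized in a timestamped CSV written next to this
file. Pass --verbose / -v to print SSE events while a benchmark runs.
"""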
import asyncio
import csv
import importlib.util
import shutil
import sys
from datetime import datetime
from pathlib import Path

from benchmark.client import BenchmarkClient
from benchmark.environment import BenchmarkConfig, ModelKwargs

DATASET_DIR = Path(__file__).parent / "dataset"
RESULTS_DIR = Path(__file__).parent
BROWSER_LOG_DIR = Path(__file__).parent.parent / "browser_log"


async def run_benchmark(
    client: BenchmarkClient, benchmark_path: Path, verbose: bool = False
) -> dict:
    """Load a benchmark config and run it.

    Args:
        client (BenchmarkClient): BenchmarkClient instance for API
            communication.
        benchmark_path (Path): Path to the benchmark JSON config file.
        verbose (bool): If True, print SSE events during the run.

    Returns:
        dict: Results including benchmark name, model, checker and
            grader outcomes.
    """
    # Clear browser logs so previous benchmark visits don't leak into this run
    if BROWSER_LOG_DIR.exists():
        for log_file in BROWSER_LOG_DIR.iterdir():
            if log_file.is_file():
                log_file.unlink()

    config = BenchmarkConfig.from_json(benchmark_path)
    data = config.data
    model_kwargs = config.model_kwargs
    model = f"{model_kwargs.model_platform}/{model_kwargs.model_type}"

    # Clear previous working directory so results are from a fresh run
    working_dir_path = Path(data.get_working_directory(model_kwargs))
    if working_dir_path.exists():
        shutil.rmtree(working_dir_path)
    working_dir_path.mkdir(parents=True, exist_ok=True)

    print(f"--- Benchmark: {data.name} ---")
    print(f"Question: {data.question}")
    print(f"Model: {model}")
    print(f"Working directory: {working_dir_path}")
    print(f"Checkers: {config.tests.checker}")
    print(f"Graders: {config.tests.grader}")

    events = await client.run(data, model_kwargs=model_kwargs, verbose=verbose)
    print(f"\n--- Done: {data.name} ({len(events)} events) ---")

    working_dir = data.get_working_directory(model_kwargs)
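
    # Checkers and graders are standalone scripts loaded from disk: each
    # checker module is expected to expose check(working_dir) -> bool and
    # each grader module grade(working_dir) -> (completed, total).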
    checker_results = []
    for checker_path in config.tests.checker:
        print(f"Running checker: {checker_path}")
        spec = importlib.util.spec_from_file_location("checker", checker_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        passed = module.check(working_dir)
        print(f" Result: {'PASS' if passed else 'FAIL'}")
        checker_results.append((checker_path, passed))

    grader_results = []
    for grader_path in config.tests.grader:
        print(f"Running grader: {grader_path}")
        spec = importlib.util.spec_from_file_location("grader", grader_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        completed, total = module.grade(working_dir)
        print(f" Progress: {completed}/{total}")
        grader_results.append((grader_path, completed, total))

    print()
    return {
        "benchmark": data.name,
        "model": model,
        "checkers": checker_results,
        "graders": grader_results,
    }


def _write_results_csv(all_results: list[dict]) -> Path:
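    """Write one CSV row per checker/grader result and return the CSV path."""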
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = RESULTS_DIR / f"{timestamp}_results.csv"
    with open(csv_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["benchmark", "model", "type", "script", "result"])
        for result in all_results:
            for script, passed in result["checkers"]:
                writer.writerow(
                    [
                        result["benchmark"],
                        result["model"],
                        "checker",
                        script,
                        "PASS" if passed else "FAIL",
                    ]
                )
            for script, completed, total in result["graders"]:
                writer.writerow(
                    [
                        result["benchmark"],
                        result["model"],
                        "grader",
                        script,
                        f"{completed}/{total}",
                    ]
                )
    return csv_path


async def main() -> None:
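    """Run every selected benchmark config and save a CSV summary of results."""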
    verbose: bool = "--verbose" in sys.argv or "-v" in sys.argv
    args: list[str] = [a for a in sys.argv[1:] if a not in ("--verbose", "-v")]
    if args:
        paths = [Path(p) for p in args]
    else:
        paths = sorted(DATASET_DIR.glob("*.json"))
    if not paths:
        print(f"No benchmark configs found in {DATASET_DIR}")
        return

    defaults = ModelKwargs()
    print("=== Benchmark Model Configuration ===")
    print(f" Platform: {defaults.model_platform}")
    print(f" Model: {defaults.model_type}")
    print(f" API URL: {defaults.api_url}")
    print()

    all_results = []
    async with BenchmarkClient() as client:
        for path in paths:
            result = await run_benchmark(client, path, verbose=verbose)
            all_results.append(result)

    csv_path = _write_results_csv(all_results)
    print(f"Results saved to {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())