# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Run benchmark configs and save checker/grader outcomes to a CSV file."""

import asyncio
import csv
import importlib.util
import shutil
import sys
from datetime import datetime
from pathlib import Path
from types import ModuleType

from benchmark.client import BenchmarkClient
from benchmark.environment import BenchmarkConfig, ModelKwargs

DATASET_DIR = Path(__file__).parent / "dataset"
RESULTS_DIR = Path(__file__).parent
BROWSER_LOG_DIR = Path(__file__).parent.parent / "browser_log"


def _load_script_module(module_name: str, script_path: "str | Path") -> ModuleType:
    """Import and execute a standalone Python script as a module.

    Args:
        module_name (str): Name given to the module spec.
        script_path (str | Path): Location of the script file to load.

    Returns:
        ModuleType: The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for the
            path (for example, an unrecognized file extension).
    """
    spec = importlib.util.spec_from_file_location(module_name, script_path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load {module_name} script from {script_path}"
        )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


async def run_benchmark(
    client: BenchmarkClient, benchmark_path: Path, verbose: bool = False
) -> dict:
    """Load a benchmark config and run it.

    Before the run, every regular file directly inside ``BROWSER_LOG_DIR``
    is deleted and the benchmark's working directory is removed and
    recreated. After the run, each configured checker script's ``check``
    and each grader script's ``grade`` is called with the working directory.

    Args:
        client (BenchmarkClient): BenchmarkClient instance for API
            communication.
        benchmark_path (Path): Path to the benchmark JSON config file.
        verbose (bool): If True, print SSE events during the run.

    Returns:
        dict: Results with keys ``benchmark`` (name), ``model``
            (``platform/type`` string), ``checkers`` (list of
            ``(script, passed)`` tuples) and ``graders`` (list of
            ``(script, completed, total)`` tuples).

    Raises:
        ImportError: If a checker or grader script cannot be loaded.
    """
    # Clear browser logs so previous benchmark visits don't leak into this run
    if BROWSER_LOG_DIR.exists():
        for log_file in BROWSER_LOG_DIR.iterdir():
            if log_file.is_file():
                log_file.unlink()

    config = BenchmarkConfig.from_json(benchmark_path)
    data = config.data
    model_kwargs = config.model_kwargs
    model = f"{model_kwargs.model_platform}/{model_kwargs.model_type}"

    # Clear previous working directory so results are from a fresh run
    working_dir_path = Path(data.get_working_directory(model_kwargs))
    if working_dir_path.exists():
        shutil.rmtree(working_dir_path)
    working_dir_path.mkdir(parents=True, exist_ok=True)

    print(f"--- Benchmark: {data.name} ---")
    print(f"Question: {data.question}")
    print(f"Model: {model}")
    print(f"Working directory: {working_dir_path}")
    print(f"Checkers: {config.tests.checker}")
    print(f"Graders: {config.tests.grader}")

    events = await client.run(data, model_kwargs=model_kwargs, verbose=verbose)

    print(f"\n--- Done: {data.name} ({len(events)} events) ---")

    # Checkers and graders receive the value exactly as returned by
    # get_working_directory (not the Path wrapper used above).
    working_dir = data.get_working_directory(model_kwargs)

    checker_results = []
    for checker_path in config.tests.checker:
        print(f"Running checker: {checker_path}")
        module = _load_script_module("checker", checker_path)
        passed = module.check(working_dir)
        print(f" Result: {'PASS' if passed else 'FAIL'}")
        checker_results.append((checker_path, passed))

    grader_results = []
    for grader_path in config.tests.grader:
        print(f"Running grader: {grader_path}")
        module = _load_script_module("grader", grader_path)
        completed, total = module.grade(working_dir)
        print(f" Progress: {completed}/{total}")
        grader_results.append((grader_path, completed, total))

    print()
    return {
        "benchmark": data.name,
        "model": model,
        "checkers": checker_results,
        "graders": grader_results,
    }


def _write_results_csv(all_results: list[dict]) -> Path:
    """Write all benchmark results to a timestamped CSV in ``RESULTS_DIR``.

    One row is written per checker (``PASS``/``FAIL``) and per grader
    (``completed/total``), after a header row.

    Args:
        all_results (list[dict]): Result dicts as returned by
            ``run_benchmark``.

    Returns:
        Path: Path of the CSV file that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = RESULTS_DIR / f"{timestamp}_results.csv"
    # Pin the encoding so the output does not depend on the platform locale.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["benchmark", "model", "type", "script", "result"])
        for result in all_results:
            for script, passed in result["checkers"]:
                writer.writerow(
                    [
                        result["benchmark"],
                        result["model"],
                        "checker",
                        script,
                        "PASS" if passed else "FAIL",
                    ]
                )
            for script, completed, total in result["graders"]:
                writer.writerow(
                    [
                        result["benchmark"],
                        result["model"],
                        "grader",
                        script,
                        f"{completed}/{total}",
                    ]
                )
    return csv_path


async def main() -> None:
    """Run the selected benchmarks sequentially and save the results.

    Positional command-line arguments are treated as benchmark config
    paths; with none given, every ``*.json`` file in ``DATASET_DIR`` is
    run in sorted order. ``--verbose`` / ``-v`` enables verbose runs.
    """
    verbose: bool = "--verbose" in sys.argv or "-v" in sys.argv
    args: list[str] = [a for a in sys.argv[1:] if a not in ("--verbose", "-v")]

    if args:
        paths = [Path(p) for p in args]
    else:
        paths = sorted(DATASET_DIR.glob("*.json"))

    if not paths:
        print(f"No benchmark configs found in {DATASET_DIR}")
        return

    defaults = ModelKwargs()
    print("=== Benchmark Model Configuration ===")
    print(f" Platform: {defaults.model_platform}")
    print(f" Model: {defaults.model_type}")
    print(f" API URL: {defaults.api_url}")
    print()

    all_results = []
    async with BenchmarkClient() as client:
        for path in paths:
            result = await run_benchmark(client, path, verbose=verbose)
            all_results.append(result)

    csv_path = _write_results_csv(all_results)
    print(f"Results saved to {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())