mirror of
https://github.com/eigent-ai/eigent.git
synced 2026-05-19 07:59:39 +00:00
Some checks failed
Test / Run Python Tests (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (javascript-typescript) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Pre-commit / pre-commit (push) Has been cancelled
Co-authored-by: bytecii <bytecii@users.noreply.github.com> Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
281 lines
10 KiB
Python
281 lines
10 KiB
Python
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Filesystem anchors, all derived from the location of this file.
ADAPTER_DIR = Path(__file__).parent
TEMPLATE_DIR = ADAPTER_DIR.joinpath("template")
EVALUATE_SCRIPT = ADAPTER_DIR.joinpath("evaluate.py")
BENCHMARK_DIR = ADAPTER_DIR.parent  # backend/benchmark/
|
|
|
|
|
|
class EigentBenchAdapter:
    """Converts eigent benchmark dataset JSONs into Harbor task directories.

    Every ``*.json`` file under ``<benchmark_dir>/dataset`` is loaded at
    construction time. Each config can then be materialised as a task
    directory under ``task_dir``: a copy of the template directory filled
    in with the task's metadata, question, checker/grader scripts, answer
    files and a ``config.json`` for ``evaluate.py``.
    """

    NAME = "eigent-bench"

    def __init__(
        self,
        task_dir: Path,
        benchmark_dir: Path | None = None,
        checker_weight: float = 0.5,
        grader_weight: float = 0.5,
    ) -> None:
        """Load the benchmark configs and remember the output settings.

        Args:
            task_dir: Directory under which task directories are created.
            benchmark_dir: Benchmark root containing ``dataset/`` (and
                optionally ``answer/``). Falls back to ``BENCHMARK_DIR``
                when omitted or falsy.
            checker_weight: Written to each task's ``config.json`` as
                ``checker_weight``.
            grader_weight: Written to each task's ``config.json`` as
                ``grader_weight``.

        Raises:
            ValueError: If no dataset JSON files are found.
        """
        self.task_dir = Path(task_dir)
        self.benchmark_dir = (
            Path(benchmark_dir) if benchmark_dir else BENCHMARK_DIR
        )
        self.checker_weight = checker_weight
        self.grader_weight = grader_weight
        self.configs = self._load_configs()
        logger.info(
            "EigentBenchAdapter initialized: %d tasks, weights=(%s/%s)",
            len(self.configs),
            self.checker_weight,
            self.grader_weight,
        )

    def _load_configs(self) -> list[dict]:
        """Load all dataset JSON configs, ordered by file path.

        Raises:
            ValueError: If ``<benchmark_dir>/dataset`` yields no JSON files.
        """
        dataset_dir = self.benchmark_dir / "dataset"
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default encoding, which would garble non-ASCII questions.
        configs = [
            json.loads(json_file.read_text(encoding="utf-8"))
            for json_file in sorted(dataset_dir.glob("*.json"))
        ]
        if not configs:
            raise ValueError(f"No benchmark configs found in {dataset_dir}")
        return configs

    def generate_task(
        self, index: int, local_task_id: str | None = None
    ) -> None:
        """Generate a single Harbor task from a benchmark config.

        Args:
            index: Position of the config in ``self.configs``.
            local_task_id: Name of the task directory; defaults to
                ``eigent-bench-<index zero-padded to 4 digits>``.

        Raises:
            IndexError: If ``index`` is outside ``self.configs``.
        """
        if index < 0 or index >= len(self.configs):
            raise IndexError(
                f"Index {index} out of range (0..{len(self.configs) - 1})"
            )

        config = self.configs[index]
        if local_task_id is None:
            local_task_id = f"eigent-bench-{index:04d}"

        self._prepare_task(config, local_task_id, index)

    def generate_all_tasks(self, limit: int | None = None) -> None:
        """Generate tasks for the first ``limit`` configs (all if ``None``).

        A failure in one task is logged with its traceback and counted; it
        does not stop the remaining tasks from being generated.
        """
        if limit is None:
            total = len(self.configs)
        else:
            # Clamp so a negative limit means "nothing" and the summary
            # below never reports a negative total.
            total = max(0, min(limit, len(self.configs)))
        success_count = 0
        fail_count = 0
        for index in range(total):
            try:
                self.generate_task(index)
                success_count += 1
            except Exception:
                # Deliberate best-effort boundary: record and keep going.
                fail_count += 1
                logger.exception("Failed to generate task %d", index)
        logger.info(
            "Generation complete: %d succeeded, %d failed out of %d",
            success_count,
            fail_count,
            total,
        )

    def _prepare_task(
        self, config: dict, local_task_id: str, index: int
    ) -> None:
        """Generate the complete Harbor task directory.

        Args:
            config: One dataset config; must contain ``data`` with a
                ``question``. ``metadata`` and ``tests`` are optional.
            local_task_id: Name of the directory created under
                ``self.task_dir``. An existing directory is replaced.
            index: Config index, used as the task name when ``data`` has
                no ``name``.

        Raises:
            KeyError: If ``data`` or ``data["question"]`` is missing.
        """
        # Read the required fields first so a malformed config fails
        # before an existing task directory is deleted.
        data = config["data"]
        question = data["question"]
        metadata = config.get("metadata", {})
        tests_config = config.get("tests", {})
        task_name = data.get("name", str(index))

        output_dir = self.task_dir / local_task_id

        # 1. Copy template
        if output_dir.exists():
            shutil.rmtree(output_dir)
        shutil.copytree(TEMPLATE_DIR, output_dir)

        # 2. Fill in task.toml
        self._write_task_toml(output_dir, metadata)

        # 3. Fill in instruction.md
        self._write_instruction(output_dir, data)

        # 4. Copy checker/grader scripts into tests/
        tests_dir = output_dir / "tests"
        tests_dir.mkdir(parents=True, exist_ok=True)

        checker_paths = []
        for checker_rel in tests_config.get("checker", []):
            # Script paths in the config are relative to backend/
            src = self.benchmark_dir.parent / checker_rel
            dest = tests_dir / f"checker_{src.stem}.py"
            shutil.copy2(src, dest)
            checker_paths.append(f"/tests/{dest.name}")

        grader_paths = []
        for grader_rel in tests_config.get("grader", []):
            src = self.benchmark_dir.parent / grader_rel
            dest = tests_dir / f"grader_{src.stem}.py"
            self._copy_and_patch_grader(src, dest)
            grader_paths.append(f"/tests/{dest.name}")

        # 5. Copy answer files into tests/answer/{name}/ for graders that
        #    need them
        answer_src = self.benchmark_dir / "answer" / task_name
        if answer_src.exists():
            answer_dest = tests_dir / "answer" / task_name
            if answer_dest.exists():
                shutil.rmtree(answer_dest)
            shutil.copytree(answer_src, answer_dest)

        # 6. Write config.json for evaluate.py
        task_config = {
            "task_id": task_name,
            "question": question,
            "checkers": checker_paths,
            "graders": grader_paths,
            "checker_weight": self.checker_weight,
            "grader_weight": self.grader_weight,
            "difficulty": metadata.get("difficulty", ""),
            "tags": metadata.get("tags", []),
        }
        (tests_dir / "config.json").write_text(
            json.dumps(task_config, indent=2) + "\n", encoding="utf-8"
        )

        # 7. Copy evaluate.py into tests/
        shutil.copy2(EVALUATE_SCRIPT, tests_dir / "evaluate.py")

        # 8. Write solution/solve.sh from answer files
        solution_dir = output_dir / "solution"
        solution_dir.mkdir(parents=True, exist_ok=True)
        if answer_src.exists():
            self._write_solution(solution_dir, answer_src)

        # 9. Make test.sh executable (only if the copied template has one)
        test_sh = tests_dir / "test.sh"
        if test_sh.exists():
            test_sh.chmod(0o755)

        logger.info("Generated task: %s", local_task_id)

    def _copy_and_patch_grader(self, src: Path, dest: Path) -> None:
        """Copy grader script and patch for Harbor container.

        Patches applied:
        - Removes browser log checks (milestone #1) and adjusts total count,
          since browser logs are eigent-specific and not available for
          other agents. The total is only reduced when a milestone #1 block
          was actually found and commented out.
        - ANSWER_CSV (grader 2): replaces path to resolve relative to
          copied script.

        Args:
            src: Grader script to read (decoded as UTF-8).
            dest: Where the patched script is written (encoded as UTF-8).
        """
        content = src.read_text(encoding="utf-8")

        # Remove BROWSER_LOG_DIR and _visited_urls function entirely
        content = re.sub(
            r"^BROWSER_LOG_DIR\s*=.*$",
            "# BROWSER_LOG_DIR removed — browser checks disabled for Harbor",
            content,
            flags=re.MULTILINE,
        )
        content = re.sub(
            r"^def _visited_urls\(\).*?^(?=\ndef |\nclass |\n[A-Z])",
            "# _visited_urls removed — browser checks disabled for Harbor\n\n",
            content,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Comment out the browser URL check block (milestone #1).
        # Pattern: "    # 1. Visited..." through the else/print block ending.
        # subn reports how many blocks were commented out, so the total is
        # only adjusted when a milestone was really removed.
        content, milestones_removed = re.subn(
            r"^(\s+# 1\. Visited.*?)(?=\n\s+(?:# 2\.|[a-z_]+ = Path|if not ))",
            lambda m: "\n".join(
                "    # " + line if line.strip() else line
                for line in m.group(1).splitlines()
            ),
            content,
            flags=re.MULTILINE | re.DOTALL,
        )
        # Remove "visited = _visited_urls()" if still present
        content = re.sub(
            r"^\s+visited = _visited_urls\(\)\s*$",
            "",
            content,
            flags=re.MULTILINE,
        )

        # Reduce total by 1 to account for the removed browser milestone.
        # Skipped for graders without one, whose total is already correct.
        if milestones_removed:
            content = re.sub(
                r"^(\s+total\s*=\s*)(\d+)",
                lambda m: f"{m.group(1)}{int(m.group(2)) - 1}",
                content,
                flags=re.MULTILINE,
            )

        # Patch ANSWER_CSV to use path relative to the script in container
        content = re.sub(
            r'Path\(__file__\)\.resolve\(\)\.parents\[\d+\]\s*/\s*"answer"',
            'Path(__file__).resolve().parent / "answer"',
            content,
        )

        # Remove the urlparse import when nothing after it still mentions
        # urlparse; keep it if it is used elsewhere in the file.
        for module in ["from urllib.parse import urlparse"]:
            if (
                module in content
                and "urlparse" not in content.split(module, 1)[1]
            ):
                content = content.replace(module + "\n", "")

        # The replacement comments above contain a non-ASCII dash, and
        # Python reads source files as UTF-8 by default, so the encoding
        # must not depend on the platform locale.
        dest.write_text(content, encoding="utf-8")

    def _write_task_toml(self, output_dir: Path, metadata: dict) -> None:
        """Substitute ``{difficulty}`` and ``{tags}`` in ``task.toml``.

        Args:
            output_dir: Task directory holding the copied ``task.toml``.
            metadata: Config metadata. ``difficulty`` defaults to
                ``"medium"`` when missing or ``None``; ``tags`` defaults to
                no extra tags. ``"eigent-bench"`` is always the first tag.
        """
        task_toml = output_dir / "task.toml"
        content = task_toml.read_text(encoding="utf-8")

        difficulty = metadata.get("difficulty")
        if difficulty is None:
            difficulty = "medium"

        # json.dumps emits a double-quoted string with quotes, backslashes
        # and control characters escaped, so a tag cannot break out of its
        # string in the TOML file. Plain tags render exactly as before.
        tags_str = ", ".join(
            json.dumps(str(tag), ensure_ascii=False)
            for tag in ["eigent-bench", *metadata.get("tags", [])]
        )

        # str() keeps a non-string difficulty from raising in replace().
        content = content.replace("{difficulty}", str(difficulty))
        content = content.replace("{tags}", tags_str)
        task_toml.write_text(content, encoding="utf-8")

    def _write_instruction(self, output_dir: Path, data: dict) -> None:
        """Substitute ``{question}`` in ``instruction.md``.

        Raises:
            KeyError: If ``data`` has no ``question``.
        """
        instruction = output_dir / "instruction.md"
        content = instruction.read_text(encoding="utf-8")
        content = content.replace("{question}", data["question"])
        # Questions may contain non-ASCII text; always write UTF-8.
        instruction.write_text(content, encoding="utf-8")

    def _write_solution(self, solution_dir: Path, answer_dir: Path) -> None:
        """Create solve.sh that copies answer files to workspace.

        Each regular file directly inside ``answer_dir`` is copied into
        ``solution_dir`` and gets one ``cp /solution/<name>
        /workspace/<name>`` line in ``solve.sh``. Subdirectories are
        skipped.
        """
        # Local import: shlex is only needed by this method.
        import shlex

        lines = ["#!/bin/bash", ""]
        for f in sorted(answer_dir.iterdir()):
            if f.is_file():
                shutil.copy2(f, solution_dir / f.name)
                # Quote both paths so names containing spaces or shell
                # metacharacters are passed to cp as a single argument.
                source = shlex.quote(f"/solution/{f.name}")
                target = shlex.quote(f"/workspace/{f.name}")
                lines.append(f"cp {source} {target}")
        lines.append("")
        solve_sh = solution_dir / "solve.sh"
        # Write bytes so the script keeps LF line endings on every platform;
        # text mode may translate them and break the shebang line.
        solve_sh.write_bytes("\n".join(lines).encode("utf-8"))
        solve_sh.chmod(0o755)
|