eigent/backend/benchmark/harbor/adapter.py
bytecii 639e3764a1
Some checks failed
Test / Run Python Tests (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (javascript-typescript) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Pre-commit / pre-commit (push) Has been cancelled
feat: support harbor for benchmark (#1466)
Co-authored-by: bytecii <bytecii@users.noreply.github.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
2026-03-08 22:24:53 +08:00

281 lines
10 KiB
Python

# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
from __future__ import annotations

import json
import logging
import re
import shlex
import shutil
from pathlib import Path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Filesystem layout: every path below is derived from this file's location.
ADAPTER_DIR = Path(__file__).parent
# Directory copied verbatim as the starting point of every generated task.
TEMPLATE_DIR = ADAPTER_DIR.joinpath("template")
# Script copied into each generated task's tests/ directory.
EVALUATE_SCRIPT = ADAPTER_DIR.joinpath("evaluate.py")
# Default root used to locate dataset/ and answer/.
BENCHMARK_DIR = ADAPTER_DIR.parent  # backend/benchmark/
class EigentBenchAdapter:
    """Converts eigent benchmark dataset JSONs into Harbor task directories.

    Every ``*.json`` file found in ``<benchmark_dir>/dataset`` is one
    benchmark config. For each config the adapter copies ``TEMPLATE_DIR``
    to ``<task_dir>/<task id>`` and fills in ``task.toml``,
    ``instruction.md``, the ``tests/`` directory (checker and grader
    scripts, answer files, ``config.json``, ``evaluate.py``) and
    ``solution/solve.sh``.

    All text files are read and written as UTF-8 so the generated tasks do
    not depend on the locale of the machine that runs the adapter.
    """

    NAME = "eigent-bench"

    def __init__(
        self,
        task_dir: Path,
        benchmark_dir: Path | None = None,
        checker_weight: float = 0.5,
        grader_weight: float = 0.5,
    ) -> None:
        """Load every dataset config and remember the output settings.

        Args:
            task_dir: Directory under which task directories are created.
            benchmark_dir: Root containing ``dataset/`` and ``answer/``.
                Falls back to ``BENCHMARK_DIR`` when falsy.
            checker_weight: Stored verbatim in each task's ``config.json``.
            grader_weight: Stored verbatim in each task's ``config.json``.

        Raises:
            ValueError: If no dataset config is found.
        """
        self.task_dir = Path(task_dir)
        self.benchmark_dir = (
            Path(benchmark_dir) if benchmark_dir else BENCHMARK_DIR
        )
        self.checker_weight = checker_weight
        self.grader_weight = grader_weight
        self.configs = self._load_configs()
        logger.info(
            "EigentBenchAdapter initialized: %d tasks, weights=(%s/%s)",
            len(self.configs),
            self.checker_weight,
            self.grader_weight,
        )

    def _load_configs(self) -> list[dict]:
        """Load all dataset JSON configs, ordered by file name.

        Raises:
            ValueError: If ``<benchmark_dir>/dataset`` holds no ``*.json``.
        """
        dataset_dir = self.benchmark_dir / "dataset"
        # JSON is UTF-8 by specification; do not rely on the locale default.
        configs = [
            json.loads(json_file.read_text(encoding="utf-8"))
            for json_file in sorted(dataset_dir.glob("*.json"))
        ]
        if not configs:
            raise ValueError(f"No benchmark configs found in {dataset_dir}")
        return configs

    def generate_task(
        self, index: int, local_task_id: str | None = None
    ) -> None:
        """Generate a single Harbor task from a benchmark config.

        Args:
            index: Position of the config in ``self.configs``.
            local_task_id: Name of the task directory; defaults to
                ``eigent-bench-<index, zero-padded to 4 digits>``.

        Raises:
            IndexError: If ``index`` is outside ``self.configs``.
        """
        if index < 0 or index >= len(self.configs):
            raise IndexError(
                f"Index {index} out of range (0..{len(self.configs) - 1})"
            )
        config = self.configs[index]
        if local_task_id is None:
            local_task_id = f"eigent-bench-{index:04d}"
        self._prepare_task(config, local_task_id, index)

    def generate_all_tasks(self, limit: int | None = None) -> None:
        """Generate all tasks, or only the first ``limit`` of them.

        A failure in one task is logged with its traceback and counted; it
        does not stop the remaining tasks from being generated.
        """
        total = (
            len(self.configs)
            if limit is None
            else min(limit, len(self.configs))
        )
        success_count = 0
        fail_count = 0
        for index in range(total):
            try:
                self.generate_task(index)
            except Exception:
                # Deliberate best-effort boundary: keep going, but leave a
                # full traceback in the log for the failed task.
                fail_count += 1
                logger.exception("Failed to generate task %d", index)
            else:
                success_count += 1
        logger.info(
            "Generation complete: %d succeeded, %d failed out of %d",
            success_count,
            fail_count,
            total,
        )

    def _prepare_task(
        self, config: dict, local_task_id: str, index: int
    ) -> None:
        """Generate the complete Harbor task directory.

        Args:
            config: One dataset config. Requires ``data.question``; reads
                optional ``data.name``, ``metadata`` and ``tests``.
            local_task_id: Name of the directory created under ``task_dir``.
            index: Fallback task name when ``data.name`` is absent.

        Raises:
            KeyError: If ``data`` or ``data.question`` is missing.
        """
        output_dir = self.task_dir / local_task_id

        # Read everything required up front so a malformed config fails
        # before an existing task directory is deleted.
        data = config["data"]
        question = data["question"]
        metadata = config.get("metadata", {})
        tests_config = config.get("tests", {})
        task_name = data.get("name", str(index))

        # 1. Copy template
        if output_dir.exists():
            shutil.rmtree(output_dir)
        shutil.copytree(TEMPLATE_DIR, output_dir)

        # 2. Fill in task.toml
        self._write_task_toml(output_dir, metadata)

        # 3. Fill in instruction.md
        self._write_instruction(output_dir, data)

        # 4. Copy checker/grader scripts into tests/
        tests_dir = output_dir / "tests"
        tests_dir.mkdir(parents=True, exist_ok=True)
        scripts_root = self.benchmark_dir.parent  # relative to backend/

        checker_paths = []
        for checker_rel in tests_config.get("checker", []):
            src = scripts_root / checker_rel
            dest = tests_dir / f"checker_{src.stem}.py"
            shutil.copy2(src, dest)
            checker_paths.append(f"/tests/{dest.name}")

        grader_paths = []
        for grader_rel in tests_config.get("grader", []):
            src = scripts_root / grader_rel
            dest = tests_dir / f"grader_{src.stem}.py"
            self._copy_and_patch_grader(src, dest)
            grader_paths.append(f"/tests/{dest.name}")

        # 5. Copy answer files into tests/answer/{name}/ for graders that
        #    need them
        answer_src = self.benchmark_dir / "answer" / task_name
        has_answers = answer_src.exists()
        if has_answers:
            answer_dest = tests_dir / "answer" / task_name
            if answer_dest.exists():
                shutil.rmtree(answer_dest)
            shutil.copytree(answer_src, answer_dest)

        # 6. Write config.json for evaluate.py
        task_config = {
            "task_id": task_name,
            "question": question,
            "checkers": checker_paths,
            "graders": grader_paths,
            "checker_weight": self.checker_weight,
            "grader_weight": self.grader_weight,
            "difficulty": metadata.get("difficulty", ""),
            "tags": metadata.get("tags", []),
        }
        (tests_dir / "config.json").write_text(
            json.dumps(task_config, indent=2) + "\n", encoding="utf-8"
        )

        # 7. Copy evaluate.py into tests/
        shutil.copy2(EVALUATE_SCRIPT, tests_dir / "evaluate.py")

        # 8. Write solution/solve.sh from answer files
        solution_dir = output_dir / "solution"
        solution_dir.mkdir(parents=True, exist_ok=True)
        if has_answers:
            self._write_solution(solution_dir, answer_src)

        # 9. Make test.sh executable
        test_sh = tests_dir / "test.sh"
        if test_sh.exists():
            test_sh.chmod(0o755)

        logger.info("Generated task: %s", local_task_id)

    def _copy_and_patch_grader(self, src: Path, dest: Path) -> None:
        """Copy grader script and patch for Harbor container.

        Patches applied:
        - Removes browser log checks (milestone #1), since browser logs are
          eigent-specific and not available for other agents. ``total`` is
          lowered by one only when such a milestone block was actually
          found, so graders without a browser check keep their total.
        - ANSWER_CSV (grader 2): replaces path to resolve relative to
          copied script.

        Args:
            src: Grader script to read.
            dest: Location the patched script is written to.
        """
        content = src.read_text(encoding="utf-8")

        # Remove BROWSER_LOG_DIR and _visited_urls function entirely
        content = re.sub(
            r"^BROWSER_LOG_DIR\s*=.*$",
            "# BROWSER_LOG_DIR removed — browser checks disabled for Harbor",
            content,
            flags=re.MULTILINE,
        )
        content = re.sub(
            r"^def _visited_urls\(\).*?^(?=\ndef |\nclass |\n[A-Z])",
            "# _visited_urls removed — browser checks disabled for Harbor\n\n",
            content,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Comment out the browser URL check block (milestone #1).
        # Pattern: " # 1. Visited..." through the else/print block ending.
        # subn reports whether a block was found, which gates the total
        # adjustment below.
        content, milestones_removed = re.subn(
            r"^(\s+# 1\. Visited.*?)(?=\n\s+(?:# 2\.|[a-z_]+ = Path|if not ))",
            lambda m: "\n".join(
                " # " + line if line.strip() else line
                for line in m.group(1).splitlines()
            ),
            content,
            flags=re.MULTILINE | re.DOTALL,
        )

        # Remove "visited = _visited_urls()" if still present
        content = re.sub(
            r"^\s+visited = _visited_urls\(\)\s*$",
            "",
            content,
            flags=re.MULTILINE,
        )

        # Reduce total by 1 to account for the removed browser milestone.
        # Skipped when nothing was removed; otherwise the denominator would
        # be one short of the milestones that still run.
        if milestones_removed:
            content = re.sub(
                r"^(\s+total\s*=\s*)(\d+)",
                lambda m: f"{m.group(1)}{int(m.group(2)) - 1}",
                content,
                flags=re.MULTILINE,
            )

        # Patch ANSWER_CSV to use path relative to the script in container
        content = re.sub(
            r'Path\(__file__\)\.resolve\(\)\.parents\[\d+\]\s*/\s*"answer"',
            'Path(__file__).resolve().parent / "answer"',
            content,
        )

        # Remove the urlparse import, but only if nothing after the import
        # line still refers to urlparse.
        for module in ["from urllib.parse import urlparse"]:
            if (
                module in content
                and "urlparse" not in content.split(module, 1)[1]
            ):
                content = content.replace(module + "\n", "")

        # The inserted notices contain a non-ASCII dash; writing UTF-8
        # keeps the result a valid Python source file on every platform.
        dest.write_text(content, encoding="utf-8")

    def _write_task_toml(self, output_dir: Path, metadata: dict) -> None:
        """Fill the ``{difficulty}`` and ``{tags}`` placeholders in task.toml.

        Args:
            output_dir: Task directory holding the templated ``task.toml``.
            metadata: Config metadata; ``difficulty`` defaults to
                ``"medium"`` and ``tags`` to an empty list.
        """
        task_toml = output_dir / "task.toml"
        content = task_toml.read_text(encoding="utf-8")
        difficulty = metadata.get("difficulty", "medium")
        tags = ["eigent-bench", *metadata.get("tags", [])]
        # json.dumps yields a double-quoted string with quotes and
        # backslashes escaped, so unusual tags cannot break the TOML array.
        # Plain tags come out exactly as before.
        tags_str = ", ".join(
            json.dumps(str(tag), ensure_ascii=False) for tag in tags
        )
        content = content.replace("{difficulty}", difficulty)
        content = content.replace("{tags}", tags_str)
        task_toml.write_text(content, encoding="utf-8")

    def _write_instruction(self, output_dir: Path, data: dict) -> None:
        """Fill the ``{question}`` placeholder in instruction.md.

        Args:
            output_dir: Task directory holding the templated file.
            data: Config ``data`` section; ``question`` is required.

        Raises:
            KeyError: If ``data`` has no ``question``.
        """
        instruction = output_dir / "instruction.md"
        content = instruction.read_text(encoding="utf-8")
        content = content.replace("{question}", data["question"])
        instruction.write_text(content, encoding="utf-8")

    def _write_solution(self, solution_dir: Path, answer_dir: Path) -> None:
        """Create solve.sh that copies answer files to workspace.

        Each regular file directly inside ``answer_dir`` is copied into
        ``solution_dir`` and gets one ``cp`` line in ``solve.sh``.
        Sub-directories are ignored.

        Args:
            solution_dir: Existing directory receiving the files and script.
            answer_dir: Directory holding the reference answer files.
        """
        lines = ["#!/bin/bash", ""]
        for f in sorted(answer_dir.iterdir()):
            if not f.is_file():
                continue
            shutil.copy2(f, solution_dir / f.name)
            # Quote both paths so names containing spaces or shell
            # metacharacters survive; plain names are left unquoted.
            source = shlex.quote(f"/solution/{f.name}")
            target = shlex.quote(f"/workspace/{f.name}")
            lines.append(f"cp {source} {target}")
        lines.append("")
        solve_sh = solution_dir / "solve.sh"
        # Written as bytes to force LF endings: a CRLF shebang line is not
        # runnable by bash inside the container.
        solve_sh.write_bytes("\n".join(lines).encode("utf-8"))
        solve_sh.chmod(0o755)