eigent/backend/benchmark/grader/1.py
bytecii f7bf29a40a
benchmark: update benchmark (#1207)
Co-authored-by: bytecii <bytecii@users.noreply.github.com>
Co-authored-by: Wendong-Fan <w3ndong.fan@gmail.com>
Co-authored-by: Wendong-Fan <133094783+Wendong-Fan@users.noreply.github.com>
2026-02-12 16:35:18 +08:00

139 lines
4.3 KiB
Python

# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
"""Grader for benchmark 1: evaluate python313_features.md milestones."""
import json
import re
import sys
from pathlib import Path
from urllib.parse import urlparse
# Directory scanned for browser toolkit logs: a "browser_log" folder located
# two levels above the directory containing this file (parents[2] of the
# resolved path of this module).
BROWSER_LOG_DIR = Path(__file__).resolve().parents[2] / "browser_log"
def _visited_urls(log_dir: "Path | None" = None) -> set[str]:
    """Extract all URLs passed to ``visit_page`` actions in the browser logs.

    Every ``hybrid_browser_toolkit_ws_*.log`` file in the log directory is
    treated as a stream of concatenated JSON values. Text that does not parse
    as JSON is skipped one character at a time, so partially corrupted logs
    still yield the entries that are intact.

    Args:
        log_dir: Directory to scan. Defaults to ``BROWSER_LOG_DIR``.

    Returns:
        The first positional argument (``inputs["args"][0]``) of every
        top-level JSON object whose ``action`` is ``"visit_page"``, when that
        argument is a string. Empty when the directory does not exist.
    """
    directory = BROWSER_LOG_DIR if log_dir is None else Path(log_dir)
    urls: set[str] = set()
    if not directory.is_dir():
        return urls

    decoder = json.JSONDecoder()
    for log_file in directory.glob("hybrid_browser_toolkit_ws_*.log"):
        # Decode explicitly as UTF-8 and replace undecodable bytes so a single
        # bad byte cannot abort grading or depend on the platform locale.
        raw = log_file.read_text(encoding="utf-8", errors="replace")
        pos, size = 0, len(raw)
        while pos < size:
            # raw_decode() does not skip leading whitespace; step over it
            # in place instead of slicing/stripping the remaining text.
            if raw[pos].isspace():
                pos += 1
                continue
            try:
                obj, pos = decoder.raw_decode(raw, pos)
            except ValueError:  # json.JSONDecodeError is a ValueError
                pos += 1
                continue
            if not isinstance(obj, dict) or obj.get("action") != "visit_page":
                continue
            # Validate the shape before indexing: malformed entries (null
            # inputs, non-list args, non-string URL) are ignored, not fatal.
            inputs = obj.get("inputs")
            args = inputs.get("args") if isinstance(inputs, dict) else None
            if isinstance(args, list) and args and isinstance(args[0], str):
                urls.add(args[0])
    return urls
def _is_python313_docs_url(url: object) -> bool:
    """Return True if *url* is on docs.python.org with "3.13" in its path.

    Non-string values and URLs that ``urlparse`` rejects are treated as
    non-matching instead of raising, since they come from untrusted logs.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
        host = parsed.hostname
    except ValueError:
        return False
    if host is None:
        return False
    on_docs_host = host == "docs.python.org" or host.endswith(
        ".docs.python.org"
    )
    return on_docs_host and "3.13" in parsed.path


def grade(working_directory: str) -> tuple[int, int]:
    """Grade benchmark 1 and return ``(completed, total)`` milestones.

    Milestone 1 is derived from the browser logs; milestones 2-7 inspect
    ``python313_features.md`` inside *working_directory*. A ``MISS [...]``
    line is printed for every milestone that was not reached.

    Args:
        working_directory: Directory expected to contain
            ``python313_features.md``.

    Returns:
        A tuple of the number of completed milestones and the total (7).
    """
    total = 7
    completed = 0
    md_file = Path(working_directory) / "python313_features.md"

    # 1. Visited the Python 3.13 What's New page
    if any(_is_python313_docs_url(u) for u in _visited_urls()):
        completed += 1
    else:
        print("MISS [1]: did not visit docs.python.org/3.13 What's New page")

    # is_file() (not exists()) so a directory with that name is a miss
    # rather than an IsADirectoryError.
    if not md_file.is_file():
        print("MISS [2-7]: python313_features.md does not exist")
        return completed, total

    # Explicit UTF-8 with replacement keeps grading independent of the
    # platform's default encoding and tolerant of stray bytes.
    content = md_file.read_text(encoding="utf-8", errors="replace")
    lower = content.lower()
    heading_flags = re.MULTILINE | re.IGNORECASE
    has_backtick = "`" in content

    # NOTE(review): checks 4 and 6 accept either the exact backtick-wrapped
    # call, or (lenient fallback) the dotted name anywhere in the file as
    # long as the file contains at least one backtick. The fallback does not
    # verify that the name itself sits inside a code span.
    checks = [
        # 2. Has a # warnings heading
        (
            re.search(r"^# warnings\b", content, heading_flags) is not None,
            "MISS [2]: no '# warnings' heading found",
        ),
        # 3. Has a # multiprocessing heading
        (
            re.search(r"^# multiprocessing\b", content, heading_flags)
            is not None,
            "MISS [3]: no '# multiprocessing' heading found",
        ),
        # 4. Mentions warnings.deprecated() with backticks
        (
            "`warnings.deprecated()`" in content
            or ("warnings.deprecated" in lower and has_backtick),
            "MISS [4]: missing `warnings.deprecated()` "
            "(expected backtick-wrapped reference)",
        ),
        # 5. Mentions PEP 702
        ("pep 702" in lower, "MISS [5]: no mention of PEP 702"),
        # 6. Mentions os.process_cpu_count() with backticks
        (
            "`os.process_cpu_count()`" in content
            or ("os.process_cpu_count" in lower and has_backtick),
            "MISS [6]: missing `os.process_cpu_count()` "
            "(expected backtick-wrapped reference)",
        ),
        # 7. Mentions os.cpu_count() (the old default being replaced)
        ("os.cpu_count" in lower, "MISS [7]: no mention of os.cpu_count()"),
    ]
    for passed, miss_message in checks:
        if passed:
            completed += 1
        else:
            print(miss_message)
    return completed, total
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument (the working directory),
    # prints the score as "completed/total" and exits with status 0 only
    # when every milestone was completed.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print(f"Usage: {sys.argv[0]} <working_directory>")
        sys.exit(1)
    done, out_of = grade(cli_args[0])
    print(f"{done}/{out_of}")
    sys.exit(int(done != out_of))