kvcache-ai-ktransformers/archive/ktransformers/tests/AIME_2024/eval_api.py
Jiaqi Liao 57d14d22bc
Refactor: restructure repository to focus on kt-kernel and KT-SFT modules (#1581)
* refactor: move legacy code to archive/ directory

  - Moved ktransformers, csrc, third_party, merge_tensors to archive/
  - Moved build scripts and configurations to archive/
  - Kept kt-kernel, KT-SFT, doc, and README files in root
  - Preserved complete git history for all moved files

* refactor: restructure repository to focus on kt-kernel and KT-SFT modules

* fix README

* fix README

* fix README

* fix README

* docs: add performance benchmarks to kt-kernel section

Add comprehensive performance data for kt-kernel to match KT-SFT's presentation:
- AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch)
- Prefill phase: up to 20× speedup vs baseline
- Decode phase: up to 4× speedup
- NUMA optimization: up to 63% throughput improvement
- Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8

Source: https://lmsys.org/blog/2025-10-22-KTransformers/

This provides users with concrete performance metrics for both core modules,
making it easier to understand the capabilities of each component.

* refactor: improve kt-kernel performance data with specific hardware and models

Replace generic performance descriptions with concrete benchmarks:
- Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX
- Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B)
- Show detailed metrics: total throughput, output throughput, concurrency details
- Match KT-SFT presentation style for consistency

This provides users with actionable performance data they can use to evaluate
hardware requirements and expected performance for their use cases.

* fix README

* docs: clean up performance table and improve formatting

* add pic for README

* refactor: simplify .gitmodules and backup legacy submodules

- Remove 7 legacy submodules from root .gitmodules (archive/third_party/*)
- Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11)
- Backup complete .gitmodules to archive/.gitmodules
- Add documentation in archive/README.md for researchers who need legacy submodules

This reduces initial clone size by ~500MB and avoids downloading unused dependencies.

* refactor: move doc/ back to root directory

Keep documentation in root for easier access and maintenance.

* refactor: consolidate all images to doc/assets/

- Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/
- Remove KT-SFT/assets/ (images already in doc/assets/)
- Update KT-SFT/README.md image references to ../doc/assets/
- Eliminates ~7.9MB image duplication
- Centralizes all documentation assets in one location

* fix pic path for README
2025-11-10 17:42:26 +08:00

137 lines
No EOL
5 KiB
Python

# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
import argparse
import json
import os
import time
import requests
import tqdm
from evaluation import filter_answer
from prompts import instruct_prompt
import pandas as pd
from datasets import load_dataset
# Route Hugging Face dataset downloads through the hf-mirror.com mirror
# (useful where huggingface.co is slow or unreachable). Must be set before
# `datasets` performs any network access.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
def generate_text(api_url, question, model_name, stream=False, auth_token=None):
    """Send a chat-completion request for *question* and return the filtered answer.

    :param api_url: Full URL of the OpenAI-compatible chat-completions endpoint.
    :param question: Raw problem statement; wrapped by ``instruct_prompt`` first.
    :param model_name: Model identifier sent in the request body.
    :param stream: Whether to request a streaming response (default ``False``).
    :param auth_token: Optional bearer token for the ``Authorization`` header.
    :return: The answer extracted by ``filter_answer``, or ``None`` on HTTP failure.
    """
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
    }
    # Only attach an Authorization header when a token was actually provided;
    # the original always sent the header (empty when no token), which some
    # servers reject as malformed.
    if auth_token:
        headers['Authorization'] = 'Bearer ' + auth_token
    question = instruct_prompt(question)
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        "temperature": 0.6,
        "max_tokens": 10240,
    }
    print(f"content: {question}")
    # NOTE(review): verify=False disables TLS certificate verification; kept to
    # preserve existing behavior, but this is insecure — confirm it is needed.
    response = requests.post(api_url, headers=headers, json=data, verify=False)
    if response.status_code == 200:
        result = response.json()
        results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
        return filter_answer(results)
    print(f"API Request failed with status code {response.status_code}")
    return None
def load_data(file_path):
    """Load the evaluation set and return it as a list of record dicts.

    Despite the parameter name, *file_path* is a Hugging Face dataset
    identifier (or local dataset path) accepted by ``datasets.load_dataset``;
    the previous docstring incorrectly described a Parquet file. The 'train'
    split is converted into a list of plain dictionaries, one per record.

    :param file_path: Dataset name/path understood by ``load_dataset``.
    :return: List of dicts, one per record in the 'train' split.
    """
    ds = load_dataset(file_path)
    df = pd.DataFrame(ds['train'])
    # to_dict('records') is the idiomatic (and faster) equivalent of
    # iterating rows and collecting row.to_dict() into a list.
    return df.to_dict('records')
def get_score(pred, answer):
    """Score a prediction against the reference answer by exact match.

    The previous docstring incorrectly claimed ROUGE scoring; this is a
    binary exact-match check. Comparison is first attempted on the raw
    values; if they differ, both are coerced to ``float`` so that numeric
    strings such as ``"5"`` and ``"5.0"`` still match.

    :param pred: The predicted answer (string or number).
    :param answer: The reference answer (string or number).
    :return: ``1`` if the prediction matches the answer, otherwise ``0``.
    """
    if pred == answer:
        return 1
    # Numeric answers may differ textually ("5" vs "5.0"); compare as floats.
    # Catch only conversion failures instead of the original bare `except:`.
    try:
        return 1 if float(pred) == float(answer) else 0
    except (TypeError, ValueError):
        return 0
def run_eval_api(
    api_url: str,
    model_name: str,
    out_path: str,
    format_tabs: bool = False,
    auth_token: str = None,
    problem_file: str = None,
    append: bool = False,
    skip: int = 0
):
    """Evaluate every problem in *problem_file* via the API and log results.

    Each result is written to *out_path* as one JSON line containing the
    index, question id, reference answer, prediction, score, and wall time.
    Failures are printed and skipped so a long run is not aborted.

    :param api_url: Chat-completions endpoint URL.
    :param model_name: Model identifier to query.
    :param out_path: Path of the JSONL results file.
    :param format_tabs: Unused; kept for interface compatibility.
    :param auth_token: Optional bearer token for the API.
    :param problem_file: Dataset identifier passed to ``load_data``.
    :param append: If ``False``, truncate *out_path* before the run starts.
    :param skip: Number of leading tasks to skip (e.g. to resume a run).
    """
    data = load_data(problem_file)
    # Truncate the output file exactly once, up front. The original reopened
    # it in "w" mode for every result when append was False, so only the
    # LAST result ever survived in the file.
    if not append:
        open(out_path, "w").close()
    pbar = tqdm.tqdm(total=len(data))
    pbar.update(skip)
    # Iterate range(skip, len(data)) instead of shifting the index inside a
    # range(len(data)) loop, which previously ran past the end of `data`
    # (IndexError) whenever skip > 0.
    for i in range(skip, len(data)):
        data_item = data[i]
        question = data_item['Problem']
        # Per-question wall-clock timer.
        start_time = time.time()
        try:
            completion = generate_text(api_url, question, model_name, auth_token=auth_token)
            if completion is None:
                raise Exception(f"Failed to get prediction for {question}")
            answer = data_item['Answer']
            score = get_score(completion, answer)
            elapsed_time = time.time() - start_time
            result = {
                "index": i,
                "question_id": data_item["ID"],
                "answer": answer,
                "prediction": completion,
                "score": score,
                "time": elapsed_time
            }
            # Always append here; any required truncation happened before the loop.
            with open(out_path, "a") as f:
                f.write(json.dumps(result) + "\n")
        except Exception as e:
            print(f"Failed to get prediction for {question}")
            print(e)
            continue
        pbar.update(1)
def main(output_path, api_url, model_name, auth_token, format_tabs, problem_file, append, skip):
    """Create the output directory (if any) and launch the evaluation run."""
    out_dir = os.path.dirname(output_path)
    # dirname() returns '' for a bare filename like "out.jsonl", and
    # os.makedirs('') raises FileNotFoundError — only create a real directory.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file, append, skip)
if __name__ == "__main__":
    # Command-line entry point for the AIME-2024 API evaluation script.
    arg_parser = argparse.ArgumentParser(description="API Generate Tester")
    arg_parser.add_argument("--api_url", type=str,
                            default="https://api.siliconflow.cn/v1/chat/completions",
                            help="API URL")
    arg_parser.add_argument("--model_name", type=str,
                            default="Pro/deepseek-ai/DeepSeek-R1",
                            help="Model Name")
    arg_parser.add_argument("--out_path", type=str,
                            default="results/api/eval_aime.jsonl",
                            help="Output Path")
    arg_parser.add_argument("--auth_token", type=str, default=None,
                            help="Auth Token")
    arg_parser.add_argument("--format_tabs", action="store_true",
                            help="Format Tabs")
    arg_parser.add_argument("--problem_file", type=str,
                            default="Maxwell-Jia/AIME_2024",
                            help="Evalset File")
    # store_false: append mode is ON by default; passing --no_append disables it.
    arg_parser.add_argument("--no_append", action="store_false",
                            help="Append to existing file")
    arg_parser.add_argument("--skip", type=int, default=0,
                            help="Skip some tasks")
    cli_args = arg_parser.parse_args()
    main(cli_args.out_path, cli_args.api_url, cli_args.model_name,
         cli_args.auth_token, cli_args.format_tabs, cli_args.problem_file,
         cli_args.no_append, cli_args.skip)