mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-05-05 23:50:14 +00:00
* refactor: move legacy code to archive/ directory - Moved ktransformers, csrc, third_party, merge_tensors to archive/ - Moved build scripts and configurations to archive/ - Kept kt-kernel, KT-SFT, doc, and README files in root - Preserved complete git history for all moved files * refactor: restructure repository to focus on kt-kernel and KT-SFT modules * fix README * fix README * fix README * fix README * docs: add performance benchmarks to kt-kernel section Add comprehensive performance data for kt-kernel to match KT-SFT's presentation: - AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch) - Prefill phase: up to 20× speedup vs baseline - Decode phase: up to 4× speedup - NUMA optimization: up to 63% throughput improvement - Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8 Source: https://lmsys.org/blog/2025-10-22-KTransformers/ This provides users with concrete performance metrics for both core modules, making it easier to understand the capabilities of each component. * refactor: improve kt-kernel performance data with specific hardware and models Replace generic performance descriptions with concrete benchmarks: - Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX - Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B) - Show detailed metrics: total throughput, output throughput, concurrency details - Match KT-SFT presentation style for consistency This provides users with actionable performance data they can use to evaluate hardware requirements and expected performance for their use cases. 
* fix README * docs: clean up performance table and improve formatting * add pic for README * refactor: simplify .gitmodules and backup legacy submodules - Remove 7 legacy submodules from root .gitmodules (archive/third_party/*) - Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11) - Backup complete .gitmodules to archive/.gitmodules - Add documentation in archive/README.md for researchers who need legacy submodules This reduces initial clone size by ~500MB and avoids downloading unused dependencies. * refactor: move doc/ back to root directory Keep documentation in root for easier access and maintenance. * refactor: consolidate all images to doc/assets/ - Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/ - Remove KT-SFT/assets/ (images already in doc/assets/) - Update KT-SFT/README.md image references to ../doc/assets/ - Eliminates ~7.9MB image duplication - Centralizes all documentation assets in one location * fix pic path for README
99 lines
No EOL
4 KiB
Python
99 lines
No EOL
4 KiB
Python
# adapt from https://github.com/abacaj/code-eval?tab=readme-ov-file
|
|
import argparse
|
|
import os
|
|
import requests
|
|
from human_eval.data import write_jsonl, read_problems
|
|
import tqdm
|
|
|
|
from evaluation import filter_code, fix_indents
|
|
from prompts import instruct_prompt
|
|
|
|
def generate_text(api_url, question, model_name, stream=False, auth_token=None):
    """Send one chat-completion request to an OpenAI-compatible API and return the code.

    Args:
        api_url: Full URL of the chat-completions endpoint.
        question: Raw HumanEval prompt; wrapped with instruct_prompt() before sending.
        model_name: Model identifier forwarded in the request body.
        stream: Whether to request a streaming response (callers here always use False).
        auth_token: Optional bearer token; when omitted, no Authorization header is sent.

    Returns:
        A single-element list with the post-processed completion string,
        or None if the server answered with a non-200 status.
    """
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
    }
    # Only attach the Authorization header when a token was supplied; the previous
    # code sent an empty "Authorization:" header, which some servers reject.
    if auth_token:
        headers['Authorization'] = 'Bearer ' + auth_token

    question = instruct_prompt(question)
    data = {
        "messages": [{"content": question, "role": "user"}],
        "model": model_name,
        "stream": stream,
        "temperature": 0.6
    }
    print(f"content: {question}")
    # NOTE(review): verify=False disables TLS certificate checking — tolerable for a
    # localhost benchmark endpoint, but unsafe against remote APIs. Confirm intent.
    response = requests.post(api_url, headers=headers, json=data, verify=False)
    if response.status_code == 200:
        result = response.json()
        # Defensive extraction: missing keys degrade to an empty completion string.
        results = result.get('choices', [{}])[0].get('message', {}).get('content', '')
        return [filter_code(fix_indents(results))]
    else:
        print(f"API Request failed with status code {response.status_code}")
        return None
|
|
|
|
def run_eval_api(
    api_url: str,
    model_name: str,
    out_path: str,
    format_tabs: bool = False,
    auth_token: str = None,
    problem_file: str = None,
    append: bool = False,
    skip: int = 0,
):
    """Generate a completion for every HumanEval problem via the API and save JSONL samples.

    Args:
        api_url: Chat-completions endpoint URL.
        model_name: Model identifier forwarded to each request.
        out_path: Destination JSONL file for the generated samples.
        format_tabs: Replace 4-space indentation in prompts with tabs before sending.
        auth_token: Optional bearer token forwarded to generate_text().
        problem_file: Optional alternative problem set; defaults to the bundled HumanEval set.
        append: When True, persist each sample immediately (resumable); when False,
            buffer all samples and write them once at the end.
        skip: Number of leading problems to skip (resume a partially finished run).
    """
    if problem_file is None:
        problems = read_problems()
    else:
        problems = read_problems(problem_file)

    samples = []
    pbar = tqdm.tqdm(total=len(problems) * 1)
    pbar.update(skip)  # reflect already-completed problems in the progress bar
    try:
        for task_id in problems:
            # Skip the first `skip` tasks (resume support).
            if skip > 0:
                skip -= 1
                continue

            if format_tabs:
                # assumes prompts are 4-space indented — TODO confirm against the evalset
                prompt = problems[task_id]["prompt"].replace("    ", "\t")
            else:
                prompt = problems[task_id]["prompt"]

            completion = generate_text(api_url, prompt, model_name, auth_token=auth_token)
            if completion is None:
                # A failed request previously raised TypeError here (iterating None),
                # aborting the whole run; skip the task and keep going instead.
                pbar.update(1)
                continue
            for sample in completion:
                result = dict(
                    task_id=task_id,
                    completion=sample,
                )
                samples += [result]
            # In append mode, flush each result as soon as it is produced so a
            # crash loses at most the in-flight task.
            if append:
                write_jsonl(out_path, [result], append=append)
            pbar.update(1)
        if not append:
            write_jsonl(out_path, samples, append=append)
    except Exception as e:
        # Best-effort: persist whatever was collected before reporting the error.
        if not append:
            write_jsonl(out_path, samples, append=append)
        print(f"Error: {e}")
    finally:
        pbar.close()  # always release the progress bar / terminal line
|
|
|
|
def main(output_path, api_url, model_name, auth_token, format_tabs, problem_file, append, skip):
    """Entry point: ensure the output directory exists, then run the evaluation.

    Note the argument-order difference: main takes the output path first, while
    run_eval_api takes (api_url, model_name, out_path, ...).
    """
    out_dir = os.path.dirname(output_path)
    # os.makedirs("") raises FileNotFoundError, so guard against a bare filename
    # such as "results.jsonl" with no directory component.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    run_eval_api(api_url, model_name, output_path, format_tabs, auth_token, problem_file, append, skip)
|
|
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="API Generate Tester")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--model_name", type=str, default="Pro/deepseek-ai/DeepSeek-V3", help="Model Name")
    parser.add_argument("--out_path", type=str, default="results/api/eval_b.jsonl", help="Output Path")
    parser.add_argument("--auth_token", type=str, default=None, help="Auth Token")
    parser.add_argument("--format_tabs", action="store_true", help="Format Tabs")
    parser.add_argument("--problem_file", type=str, default=None, help="Evalset File")
    # store_false => the parsed value defaults to True, i.e. per-sample appending is
    # ON by default; passing --no_append turns it off (write everything at the end).
    parser.add_argument("--no_append", action="store_false",
                        help="Disable per-sample appending; buffer and write all results at the end")
    parser.add_argument("--skip", type=int, default=0, help="Skip first n problems")
    args = parser.parse_args()
    main(args.out_path, args.api_url, args.model_name, args.auth_token,
         args.format_tabs, args.problem_file, args.no_append, args.skip)