mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-28 11:49:51 +00:00
* refactor: move legacy code to archive/ directory - Moved ktransformers, csrc, third_party, merge_tensors to archive/ - Moved build scripts and configurations to archive/ - Kept kt-kernel, KT-SFT, doc, and README files in root - Preserved complete git history for all moved files * refactor: restructure repository to focus on kt-kernel and KT-SFT modules * fix README * fix README * fix README * fix README * docs: add performance benchmarks to kt-kernel section Add comprehensive performance data for kt-kernel to match KT-SFT's presentation: - AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch) - Prefill phase: up to 20× speedup vs baseline - Decode phase: up to 4× speedup - NUMA optimization: up to 63% throughput improvement - Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8 Source: https://lmsys.org/blog/2025-10-22-KTransformers/ This provides users with concrete performance metrics for both core modules, making it easier to understand the capabilities of each component. * refactor: improve kt-kernel performance data with specific hardware and models Replace generic performance descriptions with concrete benchmarks: - Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX - Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B) - Show detailed metrics: total throughput, output throughput, concurrency details - Match KT-SFT presentation style for consistency This provides users with actionable performance data they can use to evaluate hardware requirements and expected performance for their use cases. 
* fix README * docs: clean up performance table and improve formatting * add pic for README * refactor: simplify .gitmodules and backup legacy submodules - Remove 7 legacy submodules from root .gitmodules (archive/third_party/*) - Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11) - Backup complete .gitmodules to archive/.gitmodules - Add documentation in archive/README.md for researchers who need legacy submodules This reduces initial clone size by ~500MB and avoids downloading unused dependencies. * refactor: move doc/ back to root directory Keep documentation in root for easier access and maintenance. * refactor: consolidate all images to doc/assets/ - Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/ - Remove KT-SFT/assets/ (images already in doc/assets/) - Update KT-SFT/README.md image references to ../doc/assets/ - Eliminates ~7.9MB image duplication - Centralizes all documentation assets in one location * fix pic path for README
105 lines
4.1 KiB
Python
105 lines
4.1 KiB
Python
import asyncio
|
|
import json
|
|
import sys
|
|
import aiohttp
|
|
import argparse
|
|
|
|
# Built-in prompts; main() picks one by index (the --question_id argument).
prompt_list = [
    "Please elaborate on modern world history.",
    "Please introduce Harry Potter.",
    "I want to learn Python. Please give me some advice.",
    "Please tell me a joke ",
]
|
|
|
|
|
|
async def fetch_event_stream(session, payload, request_id, stream):
    """POST ``payload`` to the chat-completions endpoint and print the reply.

    The target URL is read from the module-level ``SERVER_URL``, which this
    script assigns in its ``__main__`` section.

    Args:
        session: Object whose ``post(...)`` is used as an async context
            manager (``main`` passes an ``aiohttp.ClientSession``).
        payload: JSON-serialisable request body.
        request_id: Identifier used only to label the printed messages.
        stream: When true, the body is read line by line as ``data:`` events
            and each token is echoed to stdout as it arrives; otherwise the
            whole JSON body is read at once.

    Returns:
        The generated text as a string, or ``None`` when the server answers
        with a non-200 status or an exception is raised while talking to it.
        Errors are reported on stdout rather than propagated.
    """
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
    }

    try:
        # aiohttp deprecates bare numbers for ``timeout``; wrap the same
        # 50000-second budget in a ClientTimeout instead.
        timeout = aiohttp.ClientTimeout(total=50000)

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=timeout) as response:
            print(f"Request {request_id}: Connected, status {response.status}")

            if response.status != 200:
                print(f"Request {request_id}: Error, status {response.status}")
                return None

            if not stream:
                # Non-stream mode: the complete JSON document arrives at once.
                response_data = await response.json()
                choices = response_data.get("choices", [])
                if not choices:
                    return ""
                # ``content`` may be present but null; normalise to "".
                content = choices[0].get("message", {}).get("content") or ""
                print(f"Request {request_id} Output:\n{content}")
                return content

            pieces = []
            async for line in response.content:
                try:
                    decoded_line = line.decode("utf-8").strip()
                    # Only "data:" fields carry payload; the space after the
                    # colon is optional in server-sent events.
                    if not decoded_line.startswith("data:"):
                        continue

                    data = decoded_line[len("data:"):].strip()
                    if not data:
                        continue

                    # OpenAI-style servers end the stream with a literal
                    # "[DONE]" marker, which is not JSON.
                    if data == "[DONE]":
                        break

                    response_data = json.loads(data)
                    choices = response_data.get("choices", [])
                    if not choices:
                        continue

                    first_choice = choices[0]
                    token = first_choice.get("delta", {}).get("content", "")
                    if token:
                        pieces.append(token)
                        sys.stdout.write(token)
                        sys.stdout.flush()

                    if first_choice.get("finish_reason", None):
                        break

                except json.JSONDecodeError as e:
                    print(f"\nRequest {request_id}: JSON Decode Error - {e}")
                except Exception as e:
                    # Keep reading: one malformed event should not abort the
                    # whole stream.
                    print(f"\nRequest {request_id}: Error parsing stream - {e}")

            return "".join(pieces)

    except Exception as e:
        # Best-effort tester: report connection/protocol failures and move on.
        print(f"\nRequest {request_id}: Exception - {e}")
        return None
|
|
|
|
async def main(prompt_id, model, stream, max_tokens, temperature, top_p):
    """Send one chat-completion request for the prompt at ``prompt_id``.

    Opens an ``aiohttp.ClientSession``, builds the request body from the
    given sampling parameters and the selected entry of ``prompt_list``, and
    hands it to ``fetch_event_stream`` (using ``prompt_id`` as the request
    label).
    """
    async with aiohttp.ClientSession() as session:
        system_message = {"role": "system", "content": ""}
        user_message = {"role": "user", "content": prompt_list[prompt_id]}

        payload = dict(
            messages=[system_message, user_message],
            model=model,
            stream=stream,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )

        # A single request today, still dispatched through gather().
        await asyncio.gather(
            fetch_event_stream(session, payload, prompt_id, stream)
        )
|
|
|
|
def str_to_bool(value):
    """Convert a command-line string such as "true"/"False"/"0" to a bool.

    ``argparse``'s ``type=bool`` treats every non-empty string as True, so
    ``--stream False`` would silently enable streaming. This converter
    parses the text instead.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised
            boolean spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--question_id", type=int, default=0)
    parser.add_argument("--model", type=str, default="DeepSeek-V3")
    # Accepts "--stream True", "--stream False", or a bare "--stream".
    parser.add_argument("--stream", type=str_to_bool, nargs="?", const=True, default=True)
    parser.add_argument("--max_tokens", type=int, default=500)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top_p", type=float, default=1.0)
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")

    args = parser.parse_args()

    # Module-level global read by fetch_event_stream().
    SERVER_URL = args.api_url

    asyncio.run(main(args.question_id, args.model, args.stream, args.max_tokens, args.temperature, args.top_p))
|