Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2026-05-05 15:40:13 +00:00)
Refactor: restructure repository to focus on kt-kernel and KT-SFT modules (#1581)
* refactor: move legacy code to archive/ directory
  - Moved ktransformers, csrc, third_party, merge_tensors to archive/
  - Moved build scripts and configurations to archive/
  - Kept kt-kernel, KT-SFT, doc, and README files in root
  - Preserved complete git history for all moved files
* refactor: restructure repository to focus on kt-kernel and KT-SFT modules
* fix README
* fix README
* fix README
* fix README
* docs: add performance benchmarks to kt-kernel section
  Add comprehensive performance data for kt-kernel to match KT-SFT's presentation:
  - AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch)
  - Prefill phase: up to 20× speedup vs baseline
  - Decode phase: up to 4× speedup
  - NUMA optimization: up to 63% throughput improvement
  - Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8
  Source: https://lmsys.org/blog/2025-10-22-KTransformers/
  This provides users with concrete performance metrics for both core modules, making it easier to understand the capabilities of each component.
* refactor: improve kt-kernel performance data with specific hardware and models
  Replace generic performance descriptions with concrete benchmarks:
  - Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX
  - Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B)
  - Show detailed metrics: total throughput, output throughput, concurrency details
  - Match KT-SFT presentation style for consistency
  This provides users with actionable performance data they can use to evaluate hardware requirements and expected performance for their use cases.
* fix README
* docs: clean up performance table and improve formatting
* add pic for README
* refactor: simplify .gitmodules and backup legacy submodules
  - Remove 7 legacy submodules from root .gitmodules (archive/third_party/*)
  - Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11)
  - Backup complete .gitmodules to archive/.gitmodules
  - Add documentation in archive/README.md for researchers who need legacy submodules
  This reduces initial clone size by ~500MB and avoids downloading unused dependencies.
* refactor: move doc/ back to root directory
  Keep documentation in root for easier access and maintenance.
* refactor: consolidate all images to doc/assets/
  - Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/
  - Remove KT-SFT/assets/ (images already in doc/assets/)
  - Update KT-SFT/README.md image references to ../doc/assets/
  - Eliminates ~7.9MB image duplication
  - Centralizes all documentation assets in one location
* fix pic path for README
This commit is contained in:
parent 8729435d85
commit 57d14d22bc
510 changed files with 711 additions and 334 deletions
archive/ktransformers/tests/test_prefix.py (normal file, 132 lines)
@@ -0,0 +1,132 @@

import asyncio
import json
import sys
import aiohttp
import random
import argparse
import yaml
import os
import time
from time import sleep

decodesz = 128
# Server URL (SERVER_URL is set from --api_url at startup; replace the default with your server URL)
decodesz_list = [128]
prefill_speeds = []
decode_speeds = []


async def fetch_message_once(session, request_id, messages, max_tokens, model):
    """Send one streaming chat request; return (answer, usage_info, buffer)."""
    try:
        payload = {
            "messages": messages,
            "model": model,
            "temperature": 0.3,
            "top_p": 1.0,
            "stream": True,
            "return_speed": True,
            "max_tokens": max_tokens,
        }

        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }

        async with session.post(SERVER_URL, json=payload, headers=headers, timeout=500000) as response:
            if response.status != 200:
                print(f"[Request {request_id}] Error: Status {response.status}")
                return None, None, None

            buffer = ""
            usage_info = None
            answer = ""

            async for line in response.content:
                decoded_line = line.decode("utf-8").strip()
                if not decoded_line or not decoded_line.startswith("data: "):
                    continue

                decoded_line = decoded_line[6:].strip()
                if not decoded_line or decoded_line == "[DONE]":
                    # Skip keep-alive blanks and the "[DONE]" end-of-stream
                    # sentinel sent by OpenAI-compatible streaming APIs.
                    continue

                response_data = json.loads(decoded_line)

                # The server reports token counts and timings in a usage block.
                if "usage" in response_data:
                    usage_info = response_data["usage"]

                choices = response_data.get("choices", [])
                if not choices:
                    continue

                delta = choices[0].get("delta", {})
                token = delta.get("content", "")
                if token:
                    buffer += token
                    answer += token

                finish_reason = choices[0].get("finish_reason", None)
                if finish_reason:
                    break

            return answer.strip(), usage_info, buffer.strip()

    except Exception as e:
        print(f"[Request {request_id}] Exception: {e}")
        return None, None, None


async def multi_turn_conversation(session, request_id, rounds, max_tokens, model):
    """Drive a multi-turn conversation, growing the message history each round."""
    # Eight prompts about Qin Shi Huang: an introduction, his achievements,
    # historical impact, mausoleum, unification measures, political system,
    # cultural policy, and military campaigns.
    prompt = ["介绍一下秦始皇", "秦始皇的成就有哪些", "秦始皇的历史影响", "介绍一下秦始皇的陵墓", "秦始皇的统一措施", "秦始皇的政治制度", "秦始皇的文化政策", "秦始皇的军事行动"]

    messages = [{"role": "system", "content": ""}]
    global prefill_speeds, decode_speeds

    for i in range(rounds):
        # "This is round {i + 1} of the conversation; please answer the following question: ..."
        user_msg = f"这是第{i + 1}轮对话,请回答以下问题:{prompt[i % len(prompt)]}"
        messages.append({"role": "user", "content": user_msg})
        print(f"\n[Request {request_id}] >> User: {user_msg}")

        answer, usage_info, _ = await fetch_message_once(session, request_id, messages, max_tokens, model)
        if answer:
            # Append the reply as an assistant turn so the next round sends a
            # properly alternating chat history.
            messages.append({"role": "assistant", "content": answer})
            print(f"[Request {request_id}] << Assistant: {answer}")

        if usage_info:
            prefill_speed = usage_info["prompt_tokens"] / usage_info["prefill_time"]
            decode_speed = usage_info["completion_tokens"] / usage_info["decode_time"]
            prefill_speeds.append(prefill_speed)
            decode_speeds.append(decode_speed)
            print(f'[Request {request_id}] prefill speed: {prefill_speed}')
            print(f'[Request {request_id}] decode speed: {decode_speed}')


async def main(concurrent_requests, rounds, max_tokens, model):
    async with aiohttp.ClientSession() as session:
        tasks = [multi_turn_conversation(session, i, rounds, max_tokens, model) for i in range(concurrent_requests)]
        await asyncio.gather(*tasks)

    if prefill_speeds:
        import numpy as np
        print("\n=== Summary ===")
        print(f"Total concurrency: {concurrent_requests}")
        print(f"Avg prefill speed: {np.mean(prefill_speeds)}")
        print(f"Avg decode speed: {np.mean(decode_speeds)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Event Stream Request Tester")
    parser.add_argument("--concurrent", type=int, default=1, help="Number of concurrent requests")
    parser.add_argument("--model", type=str, default="DeepSeek-V3", help="Model name")
    parser.add_argument("--prompt_lens", type=int, default=1024, help="prefill prompt lens, 1024 or 2048")
    parser.add_argument("--api_url", type=str, default="http://localhost:10002/v1/chat/completions", help="API URL")
    parser.add_argument("--max_tokens", type=int, default=50, help="max decode tokens")
    parser.add_argument("--rounds", type=int, default=8, help="Number of multi-turn rounds (before final query)")

    args = parser.parse_args()
    SERVER_URL = args.api_url
    max_tokens = args.max_tokens
    model = args.model

    asyncio.run(main(args.concurrent, args.rounds, max_tokens, model))
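For reference, the per-round speed metrics come straight out of the stream's usage block. The sketch below shows the chunk shape the parser above assumes; the values are illustrative, and the prefill_time/decode_time fields (treated here as seconds) are an assumption inferred from the script rather than from documented server behavior.

import json

# A hypothetical streaming chunk in the shape the test script expects:
# OpenAI-style "choices"/"delta" fields plus a "usage" block carrying the
# timing fields the script divides by (schema assumed, values illustrative).
sample_line = "data: " + json.dumps({
    "choices": [{"delta": {"content": "hello"}, "finish_reason": "stop"}],
    "usage": {
        "prompt_tokens": 1024,
        "completion_tokens": 50,
        "prefill_time": 0.8,  # assumed: seconds spent in prefill
        "decode_time": 2.5,   # assumed: seconds spent decoding
    },
})

chunk = json.loads(sample_line[len("data: "):])
usage = chunk["usage"]
print("prefill speed:", usage["prompt_tokens"] / usage["prefill_time"])    # 1280.0 tokens/s
print("decode speed:", usage["completion_tokens"] / usage["decode_time"])  # 20.0 tokens/s

A typical invocation against a local server, using the script's own defaults, is python test_prefix.py --concurrent 4 --rounds 8 --max_tokens 50.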