Support chunked prefill; support 139K context with 24 GB of VRAM

2025-09-10 14:51:06 +00:00 · 2025-03-01 11:28:25 +00:00 · 2025-03-01 11:28:25 +00:00 · f35e8d41d8
commit f35e8d41d8
parent 494469d4c5
10 changed files with 227 additions and 83 deletions
--- a/ktransformers/server/backend/args.py
+++ b/ktransformers/server/backend/args.py
@@ -23,7 +23,7 @@ class ConfigArgs(BaseModel):
    max_batch_size: int = Field(
        None, description="Max number of batches to run at once, assuming the sequences will fit within total_context"
    )
-    max_chunk_size: int = Field(
+    chunk_prefill_size: int = Field(
        None,
        description=(
            "Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a new"