Support chunked prefill; support 139K context on 24 GB VRAM

This commit is contained in:
Atream 2025-03-01 11:28:25 +00:00
parent 494469d4c5
commit f35e8d41d8
10 changed files with 227 additions and 83 deletions

View file

@@ -23,7 +23,7 @@ class ConfigArgs(BaseModel):
max_batch_size: int = Field(
None, description="Max number of batches to run at once, assuming the sequences will fit within total_context"
)
-max_chunk_size: int = Field(
+chunk_prefill_size: int = Field(
None,
description=(
"Max chunk size. Determines the size of prefill operations. Can be reduced to reduce pauses whenever a new"