Fix bug occurring when chunk_size is not a multiple of the base size, update test examples, and resolve an issue with writing model_config. Hugging Face URL input is still unsupported.

This commit is contained in:
dongjw 2025-04-04 15:41:07 +08:00
parent 64e6aa026a
commit be84d04253
4 changed files with 65 additions and 71 deletions

View file

@ -43,10 +43,10 @@ class KDeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
def init_wrapper(self, use_cuda_graph, device, max_batch_size, max_pages):
self.use_cuda_graph = use_cuda_graph
self.workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8).to(0)
self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
self.qo_indptr_buf = torch.empty((max_batch_size+2,), dtype=torch.int32, device=device)
self.paged_kv_indptr_buf = torch.empty((max_batch_size+2,), dtype=torch.int32, device=device)
self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
self.paged_kv_len_buf = torch.empty((max_batch_size,), dtype=torch.int32, device=device)
self.paged_kv_len_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
self.bsz_tensor_buf = torch.empty((1, ), dtype=torch.int32, device=device)