Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-10 15:29:39 +00:00)
fix-singleton

parent 7f57769c23
commit 6f43bbe55f

4 changed files with 11 additions and 4 deletions
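This commit replaces eagerly-constructed CPUInfer objects (built per instance, or at class-definition time as a class attribute) with lazily initialized, class-level singletons: the attribute starts as None and the first __init__ call fills it in, so every instance shares one CPUInfer and importing the module no longer spins up a thread pool. A minimal sketch of the idiom, using hypothetical ExpensiveWorker/Holder names in place of CPUInfer and its holder classes:

    class ExpensiveWorker:
        """Stand-in for CPUInfer: costly to build (owns a thread pool)."""
        def __init__(self, threads: int) -> None:
            self.threads = threads

    class Holder:
        # None at class-definition time, so merely importing the module
        # no longer constructs a worker.
        shared_worker: "ExpensiveWorker | None" = None

        def __init__(self, threads: int) -> None:
            # First instantiation builds the worker; later ones reuse it.
            if Holder.shared_worker is None:
                Holder.shared_worker = ExpensiveWorker(threads)
            self.worker = Holder.shared_worker

    a, b = Holder(8), Holder(8)
    assert a.worker is b.worker  # both instances share one worker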
@@ -56,7 +56,7 @@ def local_chat(
     model_path: str | None = None,
     optimize_config_path: str = None,
     gguf_path: str | None = None,
-    max_new_tokens: int = 300,
+    max_new_tokens: int = 1000,
     cpu_infer: int = Config().cpu_infer,
     use_cuda_graph: bool = True,
     prompt_file : str | None = None,
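Aside from the singleton work, this hunk also raises the default max_new_tokens for local_chat from 300 to 1000; callers that relied on the old default will now generate longer outputs unless they pass the parameter explicitly.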
@@ -26,6 +26,7 @@ import json
 
 class DynamicScaledDotProductAttention:
     remaining_length: int
+    cpu_infer = None
 
     def __init__(
         self,
@@ -180,7 +181,9 @@ class DynamicScaledDotProductAttention:
         self.preselect_block_num = 0 # block_num before preselect
         self.evict_tokens = 0
 
-        self.cpu_infer = CPUInfer(threads_num)
+        if DynamicScaledDotProductAttention.cpu_infer is None:
+            DynamicScaledDotProductAttention.cpu_infer = CPUInfer(threads_num)
+        self.cpu_infer = DynamicScaledDotProductAttention.cpu_infer
         self.local_thread = CPUInferKVCache(
             self.layer_num,
             self.kv_head_num,
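Note that the instance attribute survives: self.cpu_infer now simply aliases the class-level DynamicScaledDotProductAttention.cpu_infer, so downstream code that reads self.cpu_infer is untouched, while all instances share the CPUInfer created first (a later instance's threads_num is ignored once the singleton exists).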
@@ -120,7 +120,7 @@ class KExpertsCPU(KExpertsBase):
     output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
     #stream_map:dict = {} # Manage cuda stream on different gpu
     #gguf_loader:GGUFLoader = None
-    CPU_INFER = CPUInfer(Config().cpu_infer)
+    CPU_INFER = None
     def __init__(
         self,
         key: str,
@@ -133,6 +133,8 @@ class KExpertsCPU(KExpertsBase):
         **kwargs
     ):
         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        if KExpertsCPU.CPU_INFER is None:
+            KExpertsCPU.CPU_INFER = CPUInfer(Config().cpu_infer)
         #if KExpertsCPU.gguf_loader is None:
         #    KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
         self.gguf_loader = gguf_loader
@@ -360,7 +360,7 @@ class KLinearMarlin(KLinearBase):
         self.workspace = None
 
 class KLinearCPUInfer(KLinearBase):
-    CPU_INFER = CPUInfer(Config().cpu_infer)
+    CPU_INFER = None
     def __init__(
         self,
         key: str,
@@ -374,6 +374,8 @@ class KLinearCPUInfer(KLinearBase):
         **kwargs,
     ):
         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        if KLinearCPUInfer.CPU_INFER is None:
+            KLinearCPUInfer.CPU_INFER = CPUInfer(Config().cpu_infer)
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
         self.w = None
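The KExpertsCPU and KLinearCPUInfer changes follow the same pattern, with one extra benefit: their old CPU_INFER = CPUInfer(Config().cpu_infer) ran at class-definition (import) time, so a thread pool was created even if the class was never used. With CPU_INFER = None plus the guarded assignment in __init__, construction is deferred to first use and each class still ends up with exactly one CPUInfer. The guard itself is not thread-safe; if instances can be constructed concurrently, a lock around the check-and-set would be needed.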