diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py
index 4acaf86..dc5747f 100644
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -56,7 +56,7 @@ def local_chat(
     model_path: str | None = None,
     optimize_config_path: str = None,
     gguf_path: str | None = None,
-    max_new_tokens: int = 300,
+    max_new_tokens: int = 1000,
     cpu_infer: int = Config().cpu_infer,
     use_cuda_graph: bool = True,
     prompt_file : str | None = None,
diff --git a/ktransformers/operators/dynamic_attention.py b/ktransformers/operators/dynamic_attention.py
index 13a74b4..2d8b1ef 100644
--- a/ktransformers/operators/dynamic_attention.py
+++ b/ktransformers/operators/dynamic_attention.py
@@ -26,6 +26,7 @@ import json
 
 class DynamicScaledDotProductAttention:
     remaining_length: int
+    cpu_infer = None
 
     def __init__(
         self,
@@ -180,7 +181,9 @@ class DynamicScaledDotProductAttention:
 
         self.preselect_block_num = 0 # block_num before preselect
         self.evict_tokens = 0
-        self.cpu_infer = CPUInfer(threads_num)
+        if DynamicScaledDotProductAttention.cpu_infer is None:
+            DynamicScaledDotProductAttention.cpu_infer = CPUInfer(threads_num)
+        self.cpu_infer = DynamicScaledDotProductAttention.cpu_infer
         self.local_thread = CPUInferKVCache(
             self.layer_num,
             self.kv_head_num,
diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index 88960c7..c2d5c25 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -120,7 +120,7 @@ class KExpertsCPU(KExpertsBase):
     output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
     #stream_map:dict = {} # Manage cuda stream on different gpu
     #gguf_loader:GGUFLoader = None
-    CPU_INFER = CPUInfer(Config().cpu_infer)
+    CPU_INFER = None
     def __init__(
         self,
         key: str,
@@ -133,6 +133,8 @@ class KExpertsCPU(KExpertsBase):
         **kwargs
     ):
         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        if KExpertsCPU.CPU_INFER is None:
+            KExpertsCPU.CPU_INFER = CPUInfer(Config().cpu_infer)
         #if KExpertsCPU.gguf_loader is None:
         #    KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
         self.gguf_loader = gguf_loader
diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py
index 103fc1a..5e56e72 100644
--- a/ktransformers/operators/linear.py
+++ b/ktransformers/operators/linear.py
@@ -360,7 +360,7 @@ class KLinearMarlin(KLinearBase):
         self.workspace = None
 
 class KLinearCPUInfer(KLinearBase):
-    CPU_INFER = CPUInfer(Config().cpu_infer)
+    CPU_INFER = None
     def __init__(
         self,
         key: str,
@@ -374,6 +374,8 @@
         **kwargs,
     ):
         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        if KLinearCPUInfer.CPU_INFER is None:
+            KLinearCPUInfer.CPU_INFER = CPUInfer(Config().cpu_infer)
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
         self.w = None
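
Aside from the max_new_tokens default bump, every hunk above applies the same fix: CPUInfer takes a thread count (Config().cpu_infer / threads_num), so constructing it eagerly in a class-attribute initializer builds a worker pool at import time even when the class is never used, and per-instance construction builds one pool per module. The diff instead initializes a single shared instance lazily, on first instantiation. A minimal sketch of that pattern, assuming a hypothetical stand-in CPUInfer rather than ktransformers' real C++ binding:

    # Sketch of the lazy shared-singleton pattern from the diff.
    # `CPUInfer` here is a hypothetical stand-in: only the
    # initialization pattern mirrors the real change.
    class CPUInfer:
        def __init__(self, threads_num: int):
            # The real binding would spawn `threads_num` workers here,
            # which is why one instance per module would be wasteful.
            self.threads_num = threads_num

    class KExpertsCPU:
        CPU_INFER = None  # shared by all instances, built on first use

        def __init__(self, threads_num: int):
            # Lazy init: runs once, when the first instance is created,
            # instead of eagerly when the class body is executed.
            if KExpertsCPU.CPU_INFER is None:
                KExpertsCPU.CPU_INFER = CPUInfer(threads_num)

    a = KExpertsCPU(8)
    b = KExpertsCPU(8)
    assert a.CPU_INFER is b.CPU_INFER  # one pool, shared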