diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py
index 4acaf86..dc5747f 100644
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@@ -56,7 +56,7 @@ def local_chat(
     model_path: str | None = None,
     optimize_config_path: str = None,
     gguf_path: str | None = None,
-    max_new_tokens: int = 300,
+    max_new_tokens: int = 1000,
     cpu_infer: int = Config().cpu_infer,
     use_cuda_graph: bool = True,
     prompt_file : str | None = None,
diff --git a/ktransformers/operators/dynamic_attention.py b/ktransformers/operators/dynamic_attention.py
index 13a74b4..2d8b1ef 100644
--- a/ktransformers/operators/dynamic_attention.py
+++ b/ktransformers/operators/dynamic_attention.py
@@ -26,6 +26,7 @@ import json
 
 class DynamicScaledDotProductAttention:
     remaining_length: int
+    cpu_infer = None
 
     def __init__(
         self,
@@ -180,7 +181,9 @@ class DynamicScaledDotProductAttention:
 
         self.preselect_block_num = 0 # block_num before preselect
         self.evict_tokens = 0
-        self.cpu_infer = CPUInfer(threads_num)
+        if DynamicScaledDotProductAttention.cpu_infer is None:
+            DynamicScaledDotProductAttention.cpu_infer = CPUInfer(threads_num)
+        self.cpu_infer = DynamicScaledDotProductAttention.cpu_infer
         self.local_thread = CPUInferKVCache(
             self.layer_num,
             self.kv_head_num,
diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py
index 88960c7..c2d5c25 100644
--- a/ktransformers/operators/experts.py
+++ b/ktransformers/operators/experts.py
@@ -120,7 +120,7 @@ class KExpertsCPU(KExpertsBase):
     output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
     #stream_map:dict = {} # Manage cuda stream on different gpu
     #gguf_loader:GGUFLoader = None
-    CPU_INFER = CPUInfer(Config().cpu_infer)
+    CPU_INFER = None
     def __init__(
         self,
         key: str,
@@ -133,6 +133,8 @@ class KExpertsCPU(KExpertsBase):
         **kwargs
     ):
         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        if KExpertsCPU.CPU_INFER is None:
+            KExpertsCPU.CPU_INFER = CPUInfer(Config().cpu_infer)
         #if KExpertsCPU.gguf_loader is None:
         #    KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
         self.gguf_loader = gguf_loader
diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py
index 103fc1a..5e56e72 100644
--- a/ktransformers/operators/linear.py
+++ b/ktransformers/operators/linear.py
@@ -360,7 +360,7 @@ class KLinearMarlin(KLinearBase):
         self.workspace = None
 
 class KLinearCPUInfer(KLinearBase):
-    CPU_INFER = CPUInfer(Config().cpu_infer)
+    CPU_INFER = None
     def __init__(
         self,
         key: str,
@@ -374,6 +374,8 @@
         **kwargs,
     ):
         super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
+        if KLinearCPUInfer.CPU_INFER is None:
+            KLinearCPUInfer.CPU_INFER = CPUInfer(Config().cpu_infer)
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
         self.w = None
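
Aside from the max_new_tokens default bump, every hunk above applies the same fix: CPUInfer takes a thread count (Config().cpu_infer / threads_num), so constructing it eagerly in a class-attribute initializer builds a worker pool at import time even when the class is never used, and per-instance construction builds one pool per module. The diff instead initializes a single shared instance lazily, on first instantiation. A minimal sketch of that pattern, assuming a hypothetical stand-in CPUInfer rather than ktransformers' real C++ binding:

    # Sketch of the lazy shared-singleton pattern from the diff.
    # `CPUInfer` here is a hypothetical stand-in: only the
    # initialization pattern mirrors the real change.
    class CPUInfer:
        def __init__(self, threads_num: int):
            # The real binding would spawn `threads_num` workers here,
            # which is why one instance per module would be wasteful.
            self.threads_num = threads_num

    class KExpertsCPU:
        CPU_INFER = None  # shared by all instances, built on first use

        def __init__(self, threads_num: int):
            # Lazy init: runs once, when the first instance is created,
            # instead of eagerly when the class body is executed.
            if KExpertsCPU.CPU_INFER is None:
                KExpertsCPU.CPU_INFER = CPUInfer(threads_num)

    a = KExpertsCPU(8)
    b = KExpertsCPU(8)
    assert a.CPU_INFER is b.CPU_INFER  # one pool, shared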