Default to f32 KV and 4 threads, which seem to produce better results

2025-09-11 09:34:37 +00:00 · 2023-03-25 11:11:40 +08:00 · 2023-03-25 11:11:40 +08:00 · 119392f6f2
commit 119392f6f2
parent 506cd62638
3 changed files with 5 additions and 2 deletions
--- a/llama_for_kobold.py
+++ b/llama_for_kobold.py
@@ -12,6 +12,7 @@ class load_model_inputs(ctypes.Structure):
    _fields_ = [("threads", ctypes.c_int),
                ("max_context_length", ctypes.c_int),
                ("batch_size", ctypes.c_int),
+                ("f16_kv", ctypes.c_bool),
                ("model_filename", ctypes.c_char_p),
                ("n_parts_overwrite", ctypes.c_int)]

@@ -43,8 +44,9 @@ def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwr
    inputs.model_filename = model_filename.encode("UTF-8")
    inputs.batch_size = batch_size
    inputs.max_context_length = max_context_length #initial value to use for ctx, can be overwritten
-    inputs.threads = os.cpu_count()
+    inputs.threads = 4 #seems to outperform os.cpu_count(), it's memory bottlenecked 
    inputs.n_parts_overwrite = n_parts_overwrite
+    inputs.f16_kv = False
    ret = handle.load_model(inputs)
    return ret