Fix for Windows model unloading not releasing memory (#569)

* Run the model in a separate process so that it can be killed when unloading, which releases its memory on Windows * Fix from Henky
2025-09-10 17:14:36 +00:00 · 2023-12-19 02:55:41 -05:00 · 2023-12-19 02:55:41 -05:00 · 6948da5a0d
commit 6948da5a0d
parent 4c274dc2fd
2 changed files with 65 additions and 26 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -13,6 +13,7 @@ import os
 import argparse
 import json, sys, http.server, time, asyncio, socket, threading
 from concurrent.futures import ThreadPoolExecutor
+import multiprocessing

 sampler_order_max = 7
 stop_token_max = 16
@ -2330,6 +2331,25 @@ def main(launch_args,start_server=True):
    else:
        print(f"Server was not started, main function complete. Idling.")

+def run_in_queue(launch_args, input_queue, output_queue):
+    """Run ``main`` without the server, then answer queued generate requests.
+
+    Intended as the target of a child process (see the commit message: the
+    model lives in its own process so that killing it releases the memory).
+
+    Protocol, as implemented here:
+      * once ``main(launch_args, start_server=False)`` returns, the message
+        ``{'command': 'complete'}`` is put on ``output_queue``;
+      * each ``{'command': 'generate', 'data': (args, kwargs)}`` read from
+        ``input_queue`` is answered with
+        ``{'command': 'generated text', 'data': generate(*args, **kwargs)}``;
+      * messages carrying any other command are consumed and ignored.
+
+    This function never returns; the owning process is expected to be
+    terminated from outside.
+    """
+    main(launch_args, start_server=False)
+    output_queue.put({'command': 'complete'})
+    while True:
+        # Drain everything that is currently queued, then back off briefly so
+        # the idle loop does not spin.
+        while not input_queue.empty():
+            data = input_queue.get()
+            if data['command'] == 'generate':
+                (args, kwargs) = data['data']
+                # Reply only from inside this branch: for any other command
+                # args/kwargs would be unbound (or stale from an earlier
+                # request), so no generation must be triggered.
+                output_queue.put({'command': 'generated text', 'data': generate(*args, **kwargs)})
+        time.sleep(0.2)
+        
+def start_in_seperate_process(launch_args):
+    """Start ``run_in_queue`` in a new process and hand back its handles.
+
+    Two ``multiprocessing.Queue`` objects are created: one the child reads
+    requests from and one it writes replies to.
+
+    Returns:
+        A tuple ``(output_queue, input_queue, process)`` -- note the order:
+        the queue the child writes to comes first, then the queue the child
+        reads from, then the started ``multiprocessing.Process``.
+    """
+    requests_to_child = multiprocessing.Queue()
+    replies_from_child = multiprocessing.Queue()
+    child = multiprocessing.Process(
+        target=run_in_queue,
+        args=(launch_args, requests_to_child, replies_from_child),
+    )
+    child.start()
+    return (replies_from_child, requests_to_child, child)
+
 if __name__ == '__main__':
    print("***\nWelcome to KoboldCpp - Version " + KcppVersion) # just update version manually
    # print("Python version: " + sys.version)