server: save and clear idle slots on new task (--clear-idle) (#20993)

* server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE)

* server: move idle slot KV clearing to slot release

The save "cost" is now paid by the finishing request.

* server: add --kv-clear-idle flag, enable by default

* server: skip clearing last idle slot, clear on launch

* server: test --no-kv-clear-idle flag

* server: simplify on-release clearing loop

* server: remove on-release KV clearing, keep launch-only

* cont : clean-up

* tests: update log strings after --clear-idle rename

* tests: use debug tags instead of log message matching

* tests: fix Windows CI by no longer unlinking the temporary log file

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Yes You Can Have Your Own 2026-04-03 20:02:27 +03:00 committed by GitHub
parent f1f793ad06
commit 50e0ad08fb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 187 additions and 18 deletions

View file

@ -102,6 +102,9 @@ class ServerProcess:
mmproj_url: str | None = None
media_path: str | None = None
sleep_idle_seconds: int | None = None
cache_ram: int | None = None
no_clear_idle: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False
# session variables
@ -237,6 +240,10 @@ class ServerProcess:
server_args.extend(["--media-path", self.media_path])
if self.sleep_idle_seconds is not None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.cache_ram is not None:
server_args.extend(["--cache-ram", self.cache_ram])
if self.no_clear_idle:
server_args.append("--no-clear-idle")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")
@ -249,11 +256,16 @@ class ServerProcess:
flags |= subprocess.CREATE_NEW_PROCESS_GROUP
flags |= subprocess.CREATE_NO_WINDOW
if self.log_path:
self._log = open(self.log_path, "w")
else:
self._log = sys.stdout
self.process = subprocess.Popen(
[str(arg) for arg in [server_path, *server_args]],
creationflags=flags,
stdout=sys.stdout,
stderr=sys.stdout,
stdout=self._log,
stderr=self._log if self._log != sys.stdout else sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
)
server_instances.add(self)
@ -298,6 +310,8 @@ class ServerProcess:
except Exception as e:
print(f"Error waiting for server: {e}")
self.process = None
if hasattr(self, '_log') and self._log != sys.stdout:
self._log.close()
def make_request(
self,