server: add auto-sleep after N seconds of idle (#18228)

* implement sleeping at queue level

* implement server-context suspend

* add test

* add docs

* optimization: add fast path

* make sure to free llama_init

* nits

* fix use-after-free

* allow /models to be accessed while sleeping; fix use-after-free

* don't allow accessing /models during sleep, because it is not thread-safe

* fix data race on accessing props and model_meta

* small clean up

* trailing whitespace

* remove outdated comments
This commit is contained in:
Xuan-Son Nguyen 2025-12-21 02:24:42 +01:00 committed by GitHub
parent 52ab19df63
commit ddcb75dd8a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 355 additions and 122 deletions

View file

@ -100,6 +100,7 @@ class ServerProcess:
server_path: str | None = None
mmproj_url: str | None = None
media_path: str | None = None
sleep_idle_seconds: int | None = None
# session variables
process: subprocess.Popen | None = None
@ -230,6 +231,8 @@ class ServerProcess:
server_args.extend(["--mmproj-url", self.mmproj_url])
if self.media_path:
server_args.extend(["--media-path", self.media_path])
if self.sleep_idle_seconds is not None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"tests: starting server with: {' '.join(args)}")