Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	README.md
#	build-xcframework.sh
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-metal/ggml-metal.m
#	ggml/src/ggml-metal/ggml-metal.metal
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/backend.hpp
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/vecdotq.hpp
#	scripts/compare-llama-bench.py
#	src/CMakeLists.txt
#	src/llama-model.cpp
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-opt.cpp
#	tools/llama-bench/README.md
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/README.md
#	tools/mtmd/clip.cpp
#	tools/rpc/rpc-server.cpp
#	tools/server/CMakeLists.txt
#	tools/server/README.md
Author: Concedo
Date: 2025-05-13 00:28:35 +08:00
Commit: 21e31e255b
90 changed files with 4390 additions and 1388 deletions


@@ -0,0 +1,59 @@
import pytest
from utils import *
import base64
import requests

server: ServerProcess

IMG_URL_0 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
IMG_URL_1 = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/91_cat.png"

response = requests.get(IMG_URL_0)
response.raise_for_status()  # Raise an exception for bad status codes
IMG_BASE64_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8")
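
# The /chat/completions endpoint is exercised with both remote http(s) URLs
# (IMG_URL_0 / IMG_URL_1) and the base64 data URL built above; the
# parametrized cases below cover both input forms plus several failure modes.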

@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.tinygemma3()


@pytest.mark.parametrize(
    "prompt, image_url, success, re_content",
    [
        # test model is trained on CIFAR-10, but it's quite dumb due to small size
        ("What is this:\n", IMG_URL_0, True, "(cat)+"),
        ("What is this:\n", "IMG_BASE64_0", True, "(cat)+"),  # exceptional, so that we don't clog up the log
        ("What is this:\n", IMG_URL_1, True, "(frog)+"),
        ("Test test\n", IMG_URL_1, True, "(frog)+"),  # test invalidate cache
        ("What is this:\n", "malformed", False, None),
        ("What is this:\n", "https://google.com/404", False, None),  # non-existent image
        ("What is this:\n", "https://ggml.ai", False, None),  # non-image data
    ]
)
def test_vision_chat_completion(prompt, image_url, success, re_content):
    global server
    server.start(timeout_seconds=60)  # vision model may take longer to load due to download size
    if image_url == "IMG_BASE64_0":
        image_url = IMG_BASE64_0
    res = server.make_request("POST", "/chat/completions", data={
        "temperature": 0.0,
        "top_k": 1,
        "messages": [
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {
                    "url": image_url,
                }},
            ]},
        ],
    })
    if success:
        assert res.status_code == 200
        choice = res.body["choices"][0]
        assert "assistant" == choice["message"]["role"]
        assert match_regex(re_content, choice["message"]["content"])
    else:
        assert res.status_code != 200
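
The assertions above lean on the match_regex helper imported from the shared utils module. A minimal sketch of what it is assumed to do (a permissive regex search over the generated content; the exact flags are an assumption, not taken from this diff):

import re

def match_regex(regex: str, text: str) -> bool:
    # Assumed helper: True if the pattern occurs anywhere in the text.
    return re.search(regex, text, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) is not None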


@@ -88,6 +88,7 @@ class ServerProcess:
    chat_template: str | None = None
    chat_template_file: str | None = None
    server_path: str | None = None
    mmproj_url: str | None = None

    # session variables
    process: subprocess.Popen | None = None
@@ -194,6 +195,8 @@ class ServerProcess:
        if self.chat_template:
            server_args.extend(["--chat-template", self.chat_template])
        if self.chat_template_file:
            server_args.extend(["--chat-template-file", self.chat_template_file])
        if self.mmproj_url:
            server_args.extend(["--mmproj-url", self.mmproj_url])

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"tests: starting server with: {' '.join(args)}")
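
With mmproj_url set (as in the tinygemma3 preset below), this block appends a --mmproj-url flag pair before argv is flattened. A standalone sketch of the same pattern (the server binary name is an assumption for illustration; the flag name appears in the diff above):

server_args = []
mmproj_url = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf"
if mmproj_url:
    server_args.extend(["--mmproj-url", mmproj_url])
args = [str(arg) for arg in ["llama-server", *server_args]]  # binary name assumed
print(f"tests: starting server with: {' '.join(args)}")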
@@ -379,6 +382,21 @@ class ServerPreset:
        server.server_reranking = True
        return server

    @staticmethod
    def tinygemma3() -> ServerProcess:
        server = ServerProcess()
        # mmproj is already provided by HF registry API
        server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
        server.model_hf_file = "tinygemma3-Q8_0.gguf"
        server.mmproj_url = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf"
        server.model_alias = "tinygemma3"
        server.n_ctx = 1024
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 4
        server.seed = 42
        return server


def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
    """