Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)

Commit 0c8939be19: temp checkpoint
41 changed files with 1616 additions and 2659 deletions
@@ -34,7 +34,7 @@ let
    # server tests
    openai
    behave
    pytest
    prometheus-client
  ];
in
@@ -4,10 +4,17 @@ install(TARGETS ${TARGET} RUNTIME)

# clibs dependencies
include_directories(deps/)

add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash)

add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1)
if (NOT MSVC)
    # disable warnings in 3rd party code
    target_compile_options(sha1 PRIVATE -w)
endif()

add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
1 examples/server/tests/.gitignore (vendored)
@@ -1 +1,2 @@
.venv
tmp
15 examples/server/tests/conftest.py (Normal file)
@@ -0,0 +1,15 @@
import pytest
from utils import *


# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test
@pytest.fixture(autouse=True)
def stop_server_after_each_test():
    # do nothing before each test
    yield
    # stop all servers after each test
    instances = set(
        server_instances
    )  # copy the set to prevent 'Set changed size during iteration'
    for server in instances:
        server.stop()
@@ -1,66 +0,0 @@
@llama.cpp
@ctx_shift
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And a model file test-model.gguf
    And a model alias tinyllama-2
    And BOS token is 1
    And 42 as server seed
    And 256 KV cache size
    And 32 as batch size
    And 2 slots

  # the prompt is 301 tokens
  # the slot context is 256/2 = 128 tokens
  # the prompt is truncated to keep the last 109 tokens
  # 64 tokens are generated thanks to shifting the context when it gets full
  Scenario: Inference with context shift
    And 64 server max tokens to predict
    Then the server is starting
    Then the server is healthy
    Given a prompt:
    """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And a completion request with no api error
    Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
    And the completion is truncated
    And 109 prompt tokens are processed

  Scenario Outline: Inference without context shift
    And <n_predict> server max tokens to predict
    And disable context shifting
    Then the server is starting
    Then the server is healthy
    Given a prompt:
    """
    Hi how are you
    """
    And a completion request with no api error
    Then <n_token_output> tokens are predicted matching twind|Anna
    And the completion is <truncated> truncated
    And 8 prompt tokens are processed
    Examples:
      | n_predict | n_token_output | truncated |
      | 64        | 64             | not       |
      | -1        | 120            |           |

  Scenario: Inference without context shift (expected error: prompt too long)
    And disable context shifting
    Then the server is starting
    Then the server is healthy
    Given a prompt:
    """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And a completion request with 400 api error
@@ -1,113 +0,0 @@
@llama.cpp
@embeddings
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
    And a model file bert-bge-small.gguf
    And a model alias bert-bge-small
    And 42 as server seed
    And 2 slots
    # the bert-bge-small model has context size of 512
    # since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
    And 128 as batch size
    And 128 as ubatch size
    And 512 KV cache size
    And enable embeddings endpoint
    Then the server is starting
    Then the server is healthy

  Scenario: Embedding
    When embeddings are computed for:
    """
    What is the capital of Bulgaria ?
    """
    Then embeddings are generated

  Scenario: Embedding (error: prompt too long)
    When embeddings are computed for:
    """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And embeddings request with 500 api error

  Scenario: OAI Embeddings compatibility
    Given a model bert-bge-small
    When an OAI compatible embeddings computation request for:
    """
    What is the capital of Spain ?
    """
    Then embeddings are generated

  Scenario: OAI Embeddings compatibility with multiple inputs
    Given a model bert-bge-small
    Given a prompt:
    """
    In which country Paris is located ?
    """
    And a prompt:
    """
    Is Madrid the capital of Spain ?
    """
    When an OAI compatible embeddings computation request for multiple inputs
    Then embeddings are generated

  Scenario: Multi users embeddings
    Given a prompt:
    """
    Write a very long story about AI.
    """
    And a prompt:
    """
    Write another very long music lyrics.
    """
    And a prompt:
    """
    Write a very long poem.
    """
    And a prompt:
    """
    Write a very long joke.
    """
    Given concurrent embedding requests
    Then the server is busy
    Then the server is idle
    Then all embeddings are generated

  Scenario: Multi users OAI compatibility embeddings
    Given a prompt:
    """
    In which country Paris is located ?
    """
    And a prompt:
    """
    Is Madrid the capital of Spain ?
    """
    And a prompt:
    """
    What is the biggest US city ?
    """
    And a prompt:
    """
    What is the capital of Bulgaria ?
    """
    And a model bert-bge-small
    Given concurrent OAI embedding requests
    Then the server is busy
    Then the server is idle
    Then all embeddings are generated

  Scenario: All embeddings should be the same
    Given 10 fixed prompts
    And a model bert-bge-small
    Given concurrent OAI embedding requests
    Then all embeddings are the same
@@ -1,71 +0,0 @@
import os
import signal
import socket
import sys
import time
import traceback
from contextlib import closing
from subprocess import TimeoutExpired


def before_scenario(context, scenario):
    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
    if context.debug:
        print("DEBUG=ON")
    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
    port = 8080
    if 'PORT' in os.environ:
        port = int(os.environ['PORT'])
    if is_server_listening("localhost", port):
        assert False, "Server already started"


def after_scenario(context, scenario):
    try:
        if 'server_process' not in context or context.server_process is None:
            return
        if scenario.status == "failed":
            if 'GITHUB_ACTIONS' in os.environ:
                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
                if os.path.isfile('llama.log'):
                    with closing(open('llama.log', 'r')) as f:
                        for line in f:
                            print(line)
            if not is_server_listening(context.server_fqdn, context.server_port):
                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")

        if context.server_process.poll() is not None:
            assert False, f"Server not running pid={context.server_process.pid} ..."

        server_graceful_shutdown(context)  # SIGINT

        try:
            context.server_process.wait(0.5)
        except TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
            context.server_process.kill()  # SIGKILL
            context.server_process.wait()

        while is_server_listening(context.server_fqdn, context.server_port):
            time.sleep(0.1)
    except Exception:
        print("ignoring error in after_scenario:")
        traceback.print_exc(file=sys.stdout)


def server_graceful_shutdown(context):
    print(f"shutting down server pid={context.server_process.pid} ...")
    if os.name == 'nt':
        interrupt = signal.CTRL_C_EVENT
    else:
        interrupt = signal.SIGINT
    context.server_process.send_signal(interrupt)


def is_server_listening(server_fqdn, server_port):
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        result = sock.connect_ex((server_fqdn, server_port))
        _is_server_listening = result == 0
        if _is_server_listening:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return _is_server_listening
@@ -1,36 +0,0 @@
@llama.cpp
@infill
Feature: llama.cpp server

# The current model is made by adding FIM tokens to the existing stories260K
# We may want to use a better model in the future, maybe something like SmolLM 360M

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models
    And a model file test-model-infill.gguf
    And a model alias tinyllama-infill
    And 42 as server seed
    And 1024 as batch size
    And 1024 as ubatch size
    And 2048 KV cache size
    And 64 max tokens to predict
    And 0.0 temperature
    Then the server is starting
    Then the server is healthy

  Scenario: Infill without input_extra
    Given a prompt "Complete this"
    And an infill input extra none none
    And an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
    And an infill input suffix "}\n"
    And an infill request with no api error
    Then 64 tokens are predicted matching One|day|she|saw|big|scary|bird

  Scenario: Infill with input_extra
    Given a prompt "Complete this"
    And an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n"
    And an infill input prefix "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_"
    And an infill input suffix "}\n"
    And an infill request with no api error
    Then 64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room"
@@ -1,5 +0,0 @@
# List of ongoing issues
# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
@bug
Feature: Issues
  # No confirmed issue at the moment
@@ -1,36 +0,0 @@
@llama.cpp
@lora
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
    And a model file stories15M_MOE-F16.gguf
    And a model alias stories15M_MOE
    And a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
    And 42 as server seed
    And 1024 as batch size
    And 1024 as ubatch size
    And 2048 KV cache size
    And 64 max tokens to predict
    And 0.0 temperature
    Then the server is starting
    Then the server is healthy

  Scenario: Completion LoRA disabled
    Given switch off lora adapter 0
    Given a prompt:
    """
    Look in thy glass
    """
    And a completion request with no api error
    Then 64 tokens are predicted matching little|girl|three|years|old

  Scenario: Completion LoRA enabled
    Given switch on lora adapter 0
    Given a prompt:
    """
    Look in thy glass
    """
    And a completion request with no api error
    Then 64 tokens are predicted matching eye|love|glass|sun
@@ -1,131 +0,0 @@
@llama.cpp
@parallel
Feature: Parallel

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
    And a model file test-model-00001-of-00003.gguf
    And 42 as server seed
    And 128 as batch size
    And 256 KV cache size
    And 2 slots
    And continuous batching
    Then the server is starting
    Then the server is healthy

  Scenario Outline: Multi users completion
    Given a prompt:
    """
    Write a very long story about AI.
    """
    And a prompt:
    """
    Write another very long music lyrics.
    """
    And <n_predict> max tokens to predict
    Given concurrent completion requests
    Then the server is busy
    Then the server is idle
    And all slots are idle
    Then all prompts are predicted with <n_predict> tokens
    Examples:
      | n_predict |
      | 128       |

  Scenario Outline: Multi users OAI completions compatibility
    Given a system prompt You are a writer.
    And a model tinyllama-2
    Given a prompt:
    """
    Write a very long book.
    """
    And a prompt:
    """
    Write another a poem.
    """
    And <n_predict> max tokens to predict
    And streaming is <streaming>
    Given concurrent OAI completions requests
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted with <n_predict> tokens
    Examples:
      | streaming | n_predict |
      | disabled  | 128       |
      | enabled   | 64        |

  Scenario Outline: Multi users OAI completions compatibility no v1
    Given a system prompt You are a writer.
    And a model tinyllama-2
    Given a prompt:
    """
    Write a very long book.
    """
    And a prompt:
    """
    Write another a poem.
    """
    And <n_predict> max tokens to predict
    And streaming is <streaming>
    Given concurrent OAI completions requests no v1
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted with <n_predict> tokens
    Examples:
      | streaming | n_predict |
      | disabled  | 128       |
      | enabled   | 64        |

  Scenario Outline: Multi users with number of prompts exceeding number of slots
    Given a system prompt You are a writer.
    And a model tinyllama-2
    Given a prompt:
    """
    Write a very long book.
    """
    And a prompt:
    """
    Write another a poem.
    """
    And a prompt:
    """
    What is LLM?
    """
    And a prompt:
    """
    The sky is blue and I love it.
    """
    And <n_predict> max tokens to predict
    And streaming is <streaming>
    Given concurrent OAI completions requests
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted with <n_predict> tokens
    Examples:
      | streaming | n_predict |
      | disabled  | 128       |
      | enabled   | 64        |

  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
    Given a prompt:
    """
    Write a very long story about AI.
    """
    And a prompt:
    """
    Write another very long music lyrics.
    """
    And a prompt:
    """
    Write a very long poem.
    """
    And a prompt:
    """
    Write a very long joke.
    """
    And 128 max tokens to predict
    Given concurrent completion requests
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted
@@ -1,56 +0,0 @@
# run with: ./tests.sh --no-skipped --tags passkey
@passkey
@slow
Feature: Passkey / Self-extend with context shift

  Background: Server startup
    Given a server listening on localhost:8080

  # Generates a long text of junk and inserts a secret passkey number inside it.
  # Then we query the LLM for the secret passkey.
  # see #3856 and #4810
  Scenario Outline: Passkey
    Given a model file <hf_file> from HF repo <hf_repo>
    And <n_batch> as batch size
    And <n_junk> as number of junk
    And <n_predicted> server max tokens to predict
    And 42 as seed
    And 0.0 temperature
    And <n_ctx> KV cache size
    And 1 slots
    And <n_ga> group attention factor to extend context size through self-extend
    And <n_ga_w> group attention width to extend context size through self-extend
    # Can be override with N_GPU_LAYERS
    And <ngl> GPU offloaded layers
    Then the server is starting
    # Higher timeout because the model may need to be downloaded from the internet
    Then the server is healthy with timeout 120 seconds
    Given available models
    Then model 0 is trained on <n_ctx_train> tokens context
    Given a prefix prompt:
    """
    here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
    """
    And a passkey prompt template:
    """
    The pass key is <passkey> Remember it. <passkey> is the pass key.
    """
    And a junk suffix prompt:
    """
    The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
    """
    And a suffix prompt:
    """
    What is the pass key? The pass key is
    """
    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
    And a completion request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>

    Examples:
      | hf_repo             | hf_file           | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content       |
      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 8192  | 512     | 4    | 512    | 250    | 50    | 42      | 1           | 42               |
      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 8192  | 512     | 2    | 512    | 250    | 50    | 42      | 1           | \b((?!42)\w)+\b  |
      #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0987 |
@@ -1,42 +0,0 @@
@llama.cpp
@rerank
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
    And a model file jina-reranker-v1-tiny-en.gguf
    And a model alias jina-reranker-v1-tiny-en
    And 42 as server seed
    And 2 slots
    And 512 as batch size
    And 512 as ubatch size
    And 512 KV cache size
    And enable reranking endpoint
    Then the server is starting
    Then the server is healthy

  Scenario: Rerank
    Given a rerank query:
    """
    Machine learning is
    """
    And a rerank document:
    """
    A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
    """
    And a rerank document:
    """
    Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
    """
    And a rerank document:
    """
    Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
    """
    And a rerank document:
    """
    Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
    """
    When reranking request
    Then reranking results are returned
    Then reranking highest score is index 2 and lowest score is index 3
@@ -1,118 +0,0 @@
@llama.cpp
@results
Feature: Results

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
    And a model file test-model-00001-of-00003.gguf
    And 128 as batch size
    And 1024 KV cache size
    And 128 max tokens to predict
    And continuous batching

  Scenario Outline: consistent results with same seed
    Given <n_slots> slots
    And 1.0 temperature
    Then the server is starting
    Then the server is healthy

    Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42

    Given concurrent completion requests
    Then the server is busy
    Then the server is idle
    And all slots are idle
    Then all predictions are equal
    Examples:
      | n_slots |
      | 1       |
      # FIXME: unified KV cache nondeterminism
      # | 2     |

  Scenario Outline: different results with different seed
    Given <n_slots> slots
    And 1.0 temperature
    Then the server is starting
    Then the server is healthy

    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
    Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45

    Given concurrent completion requests
    Then the server is busy
    Then the server is idle
    And all slots are idle
    Then all predictions are different
    Examples:
      | n_slots |
      | 1       |
      | 2       |

  Scenario Outline: consistent results with same seed and varying batch size
    Given 4 slots
    And <temp> temperature
    # And 0 as draft
    Then the server is starting
    Then the server is healthy

    Given 1 prompts "Write a very long story about AI." with seed 42
    And concurrent completion requests
    # Then the server is busy # Not all slots will be utilized.
    Then the server is idle
    And all slots are idle

    Given <n_parallel> prompts "Write a very long story about AI." with seed 42
    And concurrent completion requests
    # Then the server is busy # Not all slots will be utilized.
    Then the server is idle
    And all slots are idle

    Then all predictions are equal
    Examples:
      | n_parallel | temp |
      | 1          | 0.0  |
      | 1          | 1.0  |
      # FIXME: unified KV cache nondeterminism
      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
      # | 2        | 0.0  |
      # | 4        | 0.0  |
      # | 2        | 1.0  |
      # | 4        | 1.0  |

  Scenario Outline: consistent token probs with same seed and prompt
    Given <n_slots> slots
    And <n_kv> KV cache size
    And 1.0 temperature
    And <n_predict> max tokens to predict
    Then the server is starting
    Then the server is healthy

    Given 1 prompts "The meaning of life is" with seed 42
    And concurrent completion requests
    # Then the server is busy # Not all slots will be utilized.
    Then the server is idle
    And all slots are idle

    Given <n_parallel> prompts "The meaning of life is" with seed 42
    And concurrent completion requests
    # Then the server is busy # Not all slots will be utilized.
    Then the server is idle
    And all slots are idle

    Then all token probabilities are equal
    Examples:
      | n_slots | n_kv | n_predict | n_parallel |
      | 4       | 1024 | 1         | 1          |
      # FIXME: unified KV cache nondeterminism
      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
      # | 4     | 1024 | 1         | 4          |
      # | 4     | 1024 | 100       | 1          |
      # This test still fails even the above patches; the first token probabilities are already different.
      # | 4     | 1024 | 100       | 4          |
@@ -1,68 +0,0 @@
@llama.cpp
@security
Feature: Security

  Background: Server startup with an api key defined
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And a server api key THIS_IS_THE_KEY
    Then the server is starting
    Then the server is healthy

  Scenario Outline: Completion with some user api key
    Given a prompt test
    And a user api key <api_key>
    And 4 max tokens to predict
    And a completion request with <api_error> api error

    Examples: Prompts
      | api_key         | api_error |
      | THIS_IS_THE_KEY | no        |
      | THIS_IS_THE_KEY | no        |
      | hackeme         | raised    |
      |                 | raised    |

  Scenario Outline: OAI Compatibility
    Given a system prompt test
    And a user prompt test
    And a model test
    And 2 max tokens to predict
    And streaming is disabled
    And a user api key <api_key>
    Given an OAI compatible chat completions request with <api_error> api error

    Examples: Prompts
      | api_key         | api_error |
      | THIS_IS_THE_KEY | no        |
      | THIS_IS_THE_KEY | no        |
      | hackme          | raised    |

  Scenario Outline: OAI Compatibility (invalid response formats)
    Given a system prompt test
    And a user prompt test
    And a response format <response_format>
    And a model test
    And 2 max tokens to predict
    And streaming is disabled
    Given an OAI compatible chat completions request with raised api error

    Examples: Prompts
      | response_format                                        |
      | {"type": "sound"}                                      |
      | {"type": "json_object", "schema": 123}                 |
      | {"type": "json_object", "schema": {"type": 123}}       |
      | {"type": "json_object", "schema": {"type": "hiccup"}}  |

  Scenario Outline: CORS Options
    Given a user api key THIS_IS_THE_KEY
    When an OPTIONS request is sent from <origin>
    Then CORS header <cors_header> is set to <cors_header_value>

    Examples: Headers
      | origin          | cors_header                      | cors_header_value |
      | localhost       | Access-Control-Allow-Origin      | localhost         |
      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
      | origin          | Access-Control-Allow-Credentials | true              |
      | web.mydomain.fr | Access-Control-Allow-Methods     | GET, POST         |
      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
@@ -1,120 +0,0 @@
@llama.cpp
@server
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And a model file test-model.gguf
    And a model alias tinyllama-2
    And BOS token is 1
    And 42 as server seed
    # KV Cache corresponds to the total amount of tokens
    # that can be stored across all independent sequences: #4130
    # see --ctx-size and #5568
    And 256 KV cache size
    And 32 as batch size
    And 2 slots
    And 64 server max tokens to predict
    And prometheus compatible metrics exposed
    Then the server is starting
    Then the server is healthy

  Scenario: Health
    Then the server is ready
    And all slots are idle

  Scenario Outline: Completion
    Given a prompt <prompt>
    And <n_predict> max tokens to predict
    And a completion request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>
    And the completion is <truncated> truncated
    And <n_prompt> prompt tokens are processed
    And prometheus metrics are exposed
    And metric llamacpp:tokens_predicted is <n_predicted>

    Examples: Prompts
      | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |

  Scenario: Completion prompt truncated
    Given a prompt:
    """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And a completion request with no api error
    Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
    And the completion is truncated
    And 109 prompt tokens are processed

  Scenario Outline: OAI Compatibility
    Given a model <model>
    And a system prompt <system_prompt>
    And a user prompt <user_prompt>
    And <max_tokens> max tokens to predict
    And streaming is <enable_streaming>
    Given an OAI compatible chat completions request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>
    And <n_prompt> prompt tokens are processed
    And the completion is <truncated> truncated

    Examples: Prompts
      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                             | n_prompt | n_predicted | enable_streaming | truncated |
      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                          | 77       | 8           | disabled         | not       |
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+      | -1       | 64          | enabled          |           |

  Scenario Outline: OAI Compatibility w/ response format
    Given a model test
    And a system prompt test
    And a user prompt test
    And a response format <response_format>
    And 10 max tokens to predict
    Given an OAI compatible chat completions request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>

    Examples: Prompts
      | response_format                                                     | n_predicted | re_content  |
      | {"type": "json_object", "schema": {"const": "42"}}                 | 6           | "42"        |
      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]  |
      | {"type": "json_object"}                                             | 10          | \{ " Jacky. |

  Scenario: Tokenize / Detokenize
    When tokenizing:
    """
    What is the capital of France ?
    """
    Then tokens can be detokenized
    And tokens do not begin with BOS

  Scenario: Tokenize w/ BOS
    Given adding special tokens
    When tokenizing:
    """
    What is the capital of Germany?
    """
    Then tokens begin with BOS
    Given first token is removed
    Then tokens can be detokenized

  Scenario: Tokenize with pieces
    When tokenizing with pieces:
    """
    What is the capital of Germany?
    媽
    """
    Then tokens are given with pieces

  Scenario: Models available
    Given available models
    Then 1 models are supported
    Then model 0 is identified by tinyllama-2
    Then model 0 is trained on 128 tokens context
@@ -1,58 +0,0 @@
@llama.cpp
@slotsave
Feature: llama.cpp server slot management

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And prompt caching is enabled
    And 2 slots
    And . as slot save path
    And 2048 KV cache size
    And 42 as server seed
    And 24 max tokens to predict
    Then the server is starting
    Then the server is healthy

  Scenario: Save and Restore Slot
    # First prompt in slot 1 should be fully processed
    Given a user prompt "What is the capital of France?"
    And using slot id 1
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 22 prompt tokens are processed
    When the slot 1 is saved with filename "slot1.bin"
    Then the server responds with status code 200
    # Since we have cache, this should only process the last tokens
    Given a user prompt "What is the capital of Germany?"
    And a completion request with no api error
    Then 24 tokens are predicted matching (Thank|special)
    And 7 prompt tokens are processed
    # Loading the original cache into slot 0,
    # we should only be processing 1 prompt token and get the same output
    When the slot 0 is restored with filename "slot1.bin"
    Then the server responds with status code 200
    Given a user prompt "What is the capital of France?"
    And using slot id 0
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 1 prompt tokens are processed
    # For verification that slot 1 was not corrupted during slot 0 load, same thing
    Given a user prompt "What is the capital of Germany?"
    And using slot id 1
    And a completion request with no api error
    Then 24 tokens are predicted matching (Thank|special)
    And 1 prompt tokens are processed

  Scenario: Erase Slot
    Given a user prompt "What is the capital of France?"
    And using slot id 1
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 22 prompt tokens are processed
    When the slot 1 is erased
    Then the server responds with status code 200
    Given a user prompt "What is the capital of France?"
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 22 prompt tokens are processed
File diff suppressed because it is too large
@@ -1,25 +0,0 @@
# run with: ./tests.sh --no-skipped --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server

  #3969 The user must always set --n-predict option
  # to cap the number of tokens any completion request can generate
  # or pass n_predict/max_tokens in the request.
  Scenario: Infinite loop
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And 42 as server seed
    And 2048 KV cache size
    # Uncomment below to fix the issue
    #And 64 server max tokens to predict
    Then the server is starting
    Then the server is healthy
    Given a prompt:
    """
    Go to: infinite loop
    """
    # Uncomment below to fix the issue
    #And 128 max tokens to predict
    Given concurrent completion requests
    Then the server is idle
    Then all prompts are predicted
@@ -1,5 +1,5 @@
aiohttp~=3.9.3
behave~=1.2.6
pytest~=8.3.3
huggingface_hub~=0.23.2
numpy~=1.26.4
openai~=1.30.3
@@ -4,8 +4,7 @@ set -eu

if [ $# -lt 1 ]
then
  # Start @llama.cpp scenario
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
  pytest -v -x
else
  behave "$@"
  pytest "$@"
fi
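For reference, a minimal usage sketch for the updated test runner, assuming it is invoked from examples/server/tests with the Python dependencies from requirements.txt installed (the specific test file below is only an illustration):

    # install the Python test dependencies listed in requirements.txt
    pip install -r requirements.txt
    # no arguments: run the default selection shown above, which ends with `pytest -v -x`
    ./tests.sh
    # any extra arguments are forwarded to `pytest "$@"`, e.g. to run one unit test file verbosely
    ./tests.sh unit/test_basic.py -v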
34 examples/server/tests/unit/test_basic.py (Normal file)
@@ -0,0 +1,34 @@
import pytest
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


def test_server_start_simple():
    global server
    server.start()
    res = server.make_request("GET", "/health")
    assert res.status_code == 200


def test_server_props():
    global server
    server.start()
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
    assert res.body["total_slots"] == server.n_slots


def test_server_models():
    global server
    server.start()
    res = server.make_request("GET", "/models")
    assert res.status_code == 200
    assert len(res.body["data"]) == 1
    assert res.body["data"][0]["id"] == server.model_alias
129 examples/server/tests/unit/test_chat_completion.py (Normal file)
@@ -0,0 +1,129 @@
import pytest
from openai import OpenAI
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


@pytest.mark.parametrize(
    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
    [
        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
    ]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
    global server
    server.start()
    res = server.make_request("POST", "/chat/completions", data={
        "model": model,
        "max_tokens": max_tokens,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    })
    assert res.status_code == 200
    assert res.body["usage"]["prompt_tokens"] == n_prompt
    assert res.body["usage"]["completion_tokens"] == n_predicted
    choice = res.body["choices"][0]
    assert "assistant" == choice["message"]["role"]
    assert match_regex(re_content, choice["message"]["content"])
    if truncated:
        assert choice["finish_reason"] == "length"
    else:
        assert choice["finish_reason"] == "stop"


@pytest.mark.parametrize(
    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
    [
        ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
    ]
)
def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
    global server
    server.start()
    res = server.make_stream_request("POST", "/chat/completions", data={
        "model": model,
        "max_tokens": max_tokens,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "stream": True,
    })
    content = ""
    for data in res:
        choice = data["choices"][0]
        if choice["finish_reason"] in ["stop", "length"]:
            assert data["usage"]["prompt_tokens"] == n_prompt
            assert data["usage"]["completion_tokens"] == n_predicted
            assert "content" not in choice["delta"]
            assert match_regex(re_content, content)
            # FIXME: not sure why this is incorrect in stream mode
            # if truncated:
            #     assert choice["finish_reason"] == "length"
            # else:
            #     assert choice["finish_reason"] == "stop"
        else:
            assert choice["finish_reason"] is None
            content += choice["delta"]["content"]


def test_chat_completion_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
    res = client.chat.completions.create(
        model="gpt-3.5-turbo-instruct",
        messages=[
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
        max_tokens=8,
        seed=42,
        temperature=0.8,
    )
    print(res)
    assert res.choices[0].finish_reason == "stop"
    assert res.choices[0].message.content is not None
    assert match_regex("(Suddenly)+", res.choices[0].message.content)


@pytest.mark.parametrize("response_format,n_predicted,re_content", [
    ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
    ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
    ({"type": "json_object"}, 10, "(\\{|John)+"),
    ({"type": "sound"}, 0, None),
    # invalid response format (expected to fail)
    ({"type": "json_object", "schema": 123}, 0, None),
    ({"type": "json_object", "schema": {"type": 123}}, 0, None),
    ({"type": "json_object", "schema": {"type": "hiccup"}}, 0, None),
])
def test_completion_with_response_format(response_format: dict, n_predicted: int, re_content: str | None):
    global server
    server.start()
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": n_predicted,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "Write an example"},
        ],
        "response_format": response_format,
    })
    if re_content is not None:
        assert res.status_code == 200
        choice = res.body["choices"][0]
        assert match_regex(re_content, choice["message"]["content"])
    else:
        assert res.status_code != 200
        assert "error" in res.body
223 examples/server/tests/unit/test_completion.py (Normal file)
@@ -0,0 +1,223 @@
import pytest
import time
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
    })
    assert res.status_code == 200
    assert res.body["timings"]["prompt_n"] == n_prompt
    assert res.body["timings"]["predicted_n"] == n_predicted
    assert res.body["truncated"] == truncated
    assert match_regex(re_content, res.body["content"])


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "stream": True,
    })
    content = ""
    for data in res:
        if data["stop"]:
            assert data["timings"]["prompt_n"] == n_prompt
            assert data["timings"]["predicted_n"] == n_predicted
            assert data["truncated"] == truncated
            assert match_regex(re_content, content)
        else:
            content += data["content"]


@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": 1.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.parametrize("n_slots", [1, 2])
def test_different_result_different_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for seed in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": seed,
            "temperature": 1.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] != last_res.body["content"]
        last_res = res


@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0, 1.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
    global server
    server.n_batch = n_batch
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": temperature,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
def test_cache_vs_nocache_prompt():
    global server
    server.start()
    res_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": True,
    })
    res_no_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res_cache.body["content"] == res_no_cache.body["content"]


def test_completion_with_tokens_input():
    global server
    server.temperature = 0.0
    server.start()
    prompt_str = "I believe the meaning of life is"
    res = server.make_request("POST", "/tokenize", data={
        "content": prompt_str,
        "add_special": True,
    })
    assert res.status_code == 200
    tokens = res.body["tokens"]

    # single completion
    res = server.make_request("POST", "/completion", data={
        "prompt": tokens,
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str

    # batch completion
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, tokens],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens in one sequence
    res = server.make_request("POST", "/completion", data={
        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str


@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 3),
    (2, 2),
    (2, 4),
    (4, 2),  # some slots must be idle
    (4, 6),
])
def test_completion_parallel_slots(n_slots: int, n_requests: int):
    global server
    server.n_slots = n_slots
    server.temperature = 0.0
    server.start()

    PROMPTS = [
        ("Write a very long book.", "(very|special|big)+"),
        ("Write another a poem.", "(small|house)+"),
        ("What is LLM?", "(Dad|said)+"),
        ("The sky is blue and I love it.", "(climb|leaf)+"),
        ("Write another very long music lyrics.", "(friends|step|sky)+"),
        ("Write a very long joke.", "(cat|Whiskers)+"),
    ]

    def check_slots_status():
        should_all_slots_busy = n_requests >= n_slots
        time.sleep(0.1)
        res = server.make_request("GET", "/slots")
        n_busy = sum([1 for slot in res.body if slot["is_processing"]])
        if should_all_slots_busy:
            assert n_busy == n_slots
        else:
            assert n_busy <= n_slots

    tasks = []
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        tasks.append((server.make_request, ("POST", "/completion", {
            "prompt": prompt,
            "seed": 42,
            "temperature": 1.0,
        })))
    tasks.append((check_slots_status, ()))
    results = parallel_function_calls(tasks)

    # check results
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        res = results[i]
        assert res.status_code == 200
        assert type(res.body["content"]) == str
        assert len(res.body["content"]) > 10
        # FIXME: the result is not deterministic when using other slot than slot 0
        # assert match_regex(re_content, res.body["content"])
67 examples/server/tests/unit/test_ctx_shift.py Normal file
@@ -0,0 +1,67 @@
import pytest
from utils import *

server = ServerPreset.tinyllama2()


LONG_TEXT = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
""".strip()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()
    server.n_ctx = 256
    server.n_slots = 2


def test_ctx_shift_enabled():
    # the prompt is 301 tokens
    # the slot context is 256/2 = 128 tokens
    # the prompt is truncated to keep the last 109 tokens
    # 64 tokens are generated thanks to shifting the context when it gets full
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": 64,
        "prompt": LONG_TEXT,
    })
    assert res.status_code == 200
    assert res.body["timings"]["prompt_n"] == 109
    assert res.body["timings"]["predicted_n"] == 64
    assert res.body["truncated"] is True


@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
    (64, 64, False),
    (-1, 120, True),
])
def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
    global server
    server.disable_ctx_shift = True
    server.n_predict = -1
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": "Hi how are you",
    })
    assert res.status_code == 200
    assert res.body["timings"]["predicted_n"] == n_token_output
    assert res.body["truncated"] == truncated


def test_ctx_shift_disabled_long_prompt():
    global server
    server.disable_ctx_shift = True
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": 64,
        "prompt": LONG_TEXT,
    })
    assert res.status_code != 200
    assert "error" in res.body
    assert "exceeds the available context size" in res.body["error"]["message"]
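A note on the numbers in test_ctx_shift_enabled: here is a rough sketch of how 109 can come out of a 301-token prompt, assuming the slot context is n_ctx / n_slots and that the server drops whole half-context blocks from the front of the prompt. This is only an illustration of the arithmetic in the comments, not a copy of the server's implementation.

# Hypothetical illustration of the truncation arithmetic described above.
n_ctx_slot = 256 // 2   # 128 tokens per slot (n_ctx / n_slots)
n_prompt   = 301        # prompt length from the test comment
n_keep     = 0          # nothing pinned at the start

n_block = (n_ctx_slot - n_keep) // 2                         # 64-token blocks
n_erased_blocks = (n_prompt - n_keep - n_block) // n_block   # 3 blocks dropped
n_kept = n_prompt - n_keep - n_erased_blocks * n_block       # 301 - 192 = 109
print(n_kept)  # 109, matching the expected prompt_n in the test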
99
examples/server/tests/unit/test_embedding.py
Normal file
99
examples/server/tests/unit/test_embedding.py
Normal file
|
@ -0,0 +1,99 @@
|
|||
import pytest
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.bert_bge_small()
|
||||
|
||||
EPSILON = 1e-3
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.bert_bge_small()
|
||||
|
||||
|
||||
def test_embedding_single():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": "I believe the meaning of life is",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 1
|
||||
assert 'embedding' in res.body['data'][0]
|
||||
assert len(res.body['data'][0]['embedding']) > 1
|
||||
|
||||
# make sure embedding vector is normalized
|
||||
assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
|
||||
|
||||
|
||||
def test_embedding_multiple():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": [
|
||||
"I believe the meaning of life is",
|
||||
"Write a joke about AI from a very long prompt which will not be truncated",
|
||||
"This is a test",
|
||||
"This is another test",
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 4
|
||||
for d in res.body['data']:
|
||||
assert 'embedding' in d
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_openai_library_single():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
|
||||
res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is")
|
||||
assert len(res.data) == 1
|
||||
assert len(res.data[0].embedding) > 1
|
||||
|
||||
|
||||
def test_embedding_openai_library_multiple():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
|
||||
res = client.embeddings.create(model="text-embedding-3-small", input=[
|
||||
"I believe the meaning of life is",
|
||||
"Write a joke about AI from a very long prompt which will not be truncated",
|
||||
"This is a test",
|
||||
"This is another test",
|
||||
])
|
||||
assert len(res.data) == 4
|
||||
for d in res.data:
|
||||
assert len(d.embedding) > 1
|
||||
|
||||
|
||||
def test_embedding_error_prompt_too_long():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": "This is a test " * 512,
|
||||
})
|
||||
assert res.status_code != 200
|
||||
assert "too large" in res.body["error"]["message"]
|
||||
|
||||
|
||||
def test_same_prompt_give_same_result():
|
||||
server.start()
|
||||
res = server.make_request("POST", "/embeddings", data={
|
||||
"input": [
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
"I believe the meaning of life is",
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 5
|
||||
for i in range(1, len(res.body['data'])):
|
||||
v0 = res.body['data'][0]['embedding']
|
||||
vi = res.body['data'][i]['embedding']
|
||||
for x, y in zip(v0, vi):
|
||||
assert abs(x - y) < EPSILON
|
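Since the tests only assert that the returned vectors are unit-length, here is a small sketch of how two of them could be compared; for normalized vectors, cosine similarity reduces to a plain dot product. The request shape mirrors the tests above.

# Minimal sketch, reusing the same /embeddings endpoint as the tests above.
import math
res = server.make_request("POST", "/embeddings", data={
    "input": ["I believe the meaning of life is", "This is a test"],
})
v0 = res.body['data'][0]['embedding']
v1 = res.body['data'][1]['embedding']
norm0 = math.sqrt(sum(x * x for x in v0))    # should be ~1.0, matching the EPSILON check
cosine = sum(x * y for x, y in zip(v0, v1))  # dot product == cosine for unit vectors
print(norm0, cosine)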
35 examples/server/tests/unit/test_infill.py Normal file
@@ -0,0 +1,35 @@
import pytest
from utils import *

server = ServerPreset.tinyllama_infill()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama_infill()


def test_infill_without_input_extra():
    global server
    server.start()
    res = server.make_request("POST", "/infill", data={
        "prompt": "Complete this",
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
        "input_suffix": "}\n",
    })
    assert res.status_code == 200
    assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])


def test_infill_with_input_extra():
    global server
    server.start()
    res = server.make_request("POST", "/infill", data={
        "prompt": "Complete this",
        "input_extra": [{
            "filename": "llama.h",
            "text": "LLAMA_API int32_t llama_n_threads();\n"
        }],
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
        "input_suffix": "}\n",
    })
    assert res.status_code == 200
    assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
42 examples/server/tests/unit/test_lora.py Normal file
@@ -0,0 +1,42 @@
import pytest
import os
from utils import *

server = ServerPreset.stories15m_moe()

LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.stories15m_moe()
    # download the lora file if needed
    file_name = LORA_FILE_URL.split('/').pop()
    lora_file = f'../../../{file_name}'
    if not os.path.exists(lora_file):
        print(f"Downloading {LORA_FILE_URL} to {lora_file}")
        with open(lora_file, 'wb') as f:
            f.write(requests.get(LORA_FILE_URL).content)
        print("Done downloading lora file")
    server.lora_files = [lora_file]


@pytest.mark.parametrize("scale,re_content", [
    # without applying lora, the model should behave like a bedtime story generator
    (0.0, "(little|girl|three|years|old)+"),
    # with lora, the model should behave like a Shakespearean text generator
    (1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
    global server
    server.start()
    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
        {"id": 0, "scale": scale}
    ])
    assert res_lora_control.status_code == 200
    res = server.make_request("POST", "/completion", data={
        "prompt": "Look in thy glass",
    })
    assert res.status_code == 200
    assert match_regex(re_content, res.body["content"])
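Because the adapter scale is set per server instance, tests that change it may want to put it back afterwards. A small sketch using the same /lora-adapters endpoint and id/scale payload as above (the helper name is hypothetical):

def reset_lora_scale(server, scale: float = 1.0):
    # Re-apply a scale to adapter 0 via the same endpoint the test uses.
    res = server.make_request("POST", "/lora-adapters", data=[{"id": 0, "scale": scale}])
    assert res.status_code == 200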
38 examples/server/tests/unit/test_rerank.py Normal file
@@ -0,0 +1,38 @@
import pytest
from utils import *

server = ServerPreset.jina_reranker_tiny()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.jina_reranker_tiny()


def test_rerank():
    global server
    server.start()
    res = server.make_request("POST", "/rerank", data={
        "query": "Machine learning is",
        "documents": [
            "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.",
            "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.",
            "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
            "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine."
        ]
    })
    assert res.status_code == 200
    assert len(res.body["results"]) == 4

    most_relevant = res.body["results"][0]
    least_relevant = res.body["results"][0]
    for doc in res.body["results"]:
        if doc["relevance_score"] > most_relevant["relevance_score"]:
            most_relevant = doc
        if doc["relevance_score"] < least_relevant["relevance_score"]:
            least_relevant = doc

    assert most_relevant["relevance_score"] > least_relevant["relevance_score"]
    assert most_relevant["index"] == 2
    assert least_relevant["index"] == 3
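The test scans for the best and worst scores manually; an equivalent way to get the full ranking is to sort the results by relevance_score. A sketch based on the same response shape:

# Sort the /rerank results from most to least relevant, using the fields asserted above.
ranking = sorted(res.body["results"], key=lambda r: r["relevance_score"], reverse=True)
ordered_indices = [r["index"] for r in ranking]  # the ML definition (index 2) should come first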
83 examples/server/tests/unit/test_security.py Normal file
@@ -0,0 +1,83 @@
import pytest
from openai import OpenAI
from utils import *

server = ServerPreset.tinyllama2()

TEST_API_KEY = "sk-this-is-the-secret-key"


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()
    server.api_key = TEST_API_KEY


@pytest.mark.parametrize("endpoint", ["/health", "/models"])
def test_access_public_endpoint(endpoint: str):
    global server
    server.start()
    res = server.make_request("GET", endpoint)
    assert res.status_code == 200
    assert "error" not in res.body


@pytest.mark.parametrize("api_key", [None, "invalid-key"])
def test_incorrect_api_key(api_key: str):
    global server
    server.start()
    res = server.make_request("POST", "/completions", data={
        "prompt": "I believe the meaning of life is",
    }, headers={
        "Authorization": f"Bearer {api_key}" if api_key else None,
    })
    assert res.status_code == 401
    assert "error" in res.body
    assert res.body["error"]["type"] == "authentication_error"


def test_correct_api_key():
    global server
    server.start()
    res = server.make_request("POST", "/completions", data={
        "prompt": "I believe the meaning of life is",
    }, headers={
        "Authorization": f"Bearer {TEST_API_KEY}",
    })
    assert res.status_code == 200
    assert "error" not in res.body
    assert "content" in res.body


def test_openai_library_correct_api_key():
    global server
    server.start()
    client = OpenAI(api_key=TEST_API_KEY, base_url=f"http://{server.server_host}:{server.server_port}")
    res = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a chatbot."},
            {"role": "user", "content": "What is the meaning of life?"},
        ],
    )
    assert len(res.choices) == 1


@pytest.mark.parametrize("origin,cors_header,cors_header_value", [
    ("localhost", "Access-Control-Allow-Origin", "localhost"),
    ("web.mydomain.fr", "Access-Control-Allow-Origin", "web.mydomain.fr"),
    ("origin", "Access-Control-Allow-Credentials", "true"),
    ("web.mydomain.fr", "Access-Control-Allow-Methods", "GET, POST"),
    ("web.mydomain.fr", "Access-Control-Allow-Headers", "*"),
])
def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
    global server
    server.start()
    res = server.make_request("OPTIONS", "/completions", headers={
        "Origin": origin,
        "Access-Control-Request-Method": "POST",
        "Access-Control-Request-Headers": "Authorization",
    })
    assert res.status_code == 200
    assert cors_header in res.headers
    assert res.headers[cors_header] == cors_header_value
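Outside the test harness, the same authentication check can be reproduced with a plain requests call; a sketch assuming the server was started with --api-key as in the fixture above (host and port are placeholders, the fixture uses server.server_host / server.server_port):

import requests
url = "http://127.0.0.1:8080/completions"
headers = {"Authorization": f"Bearer {TEST_API_KEY}"}
r = requests.post(url, json={"prompt": "I believe the meaning of life is"}, headers=headers)
assert r.status_code == 200  # 401 with a missing or wrong key, as tested above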
98 examples/server/tests/unit/test_slot_save.py Normal file
@@ -0,0 +1,98 @@
import pytest
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()
    server.slot_save_path = "./tmp"
    server.temperature = 0.0


def test_slot_save_restore():
    global server
    server.start()

    # First prompt in slot 1 should be fully processed
    res = server.make_request("POST", "/completion", data={
        "prompt": "What is the capital of France?",
        "id_slot": 1,
        "cache_prompt": True,
    })
    assert res.status_code == 200
    assert match_regex("(Whiskers|Flana)+", res.body["content"])
    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed

    # Save the state of slot 1
    res = server.make_request("POST", "/slots/1?action=save", data={
        "filename": "slot1.bin",
    })
    assert res.status_code == 200
    assert res.body["n_saved"] == 84

    # Since we have cache, this should only process the last tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": "What is the capital of Germany?",
        "id_slot": 1,
        "cache_prompt": True,
    })
    assert res.status_code == 200
    assert match_regex("(Jack|said)+", res.body["content"])
    assert res.body["timings"]["prompt_n"] == 6  # only the differing part is processed

    # Load the saved cache into slot 0
    res = server.make_request("POST", "/slots/0?action=restore", data={
        "filename": "slot1.bin",
    })
    assert res.status_code == 200
    assert res.body["n_restored"] == 84

    # Since we have cache, slot 0 should only process the last tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": "What is the capital of Germany?",
        "id_slot": 0,
        "cache_prompt": True,
    })
    assert res.status_code == 200
    assert match_regex("(Jack|said)+", res.body["content"])
    assert res.body["timings"]["prompt_n"] == 6  # only the differing part is processed

    # Verify that slot 1 was not corrupted during the slot 0 restore: the same request should still hit its cache
    res = server.make_request("POST", "/completion", data={
        "prompt": "What is the capital of Germany?",
        "id_slot": 1,
        "cache_prompt": True,
    })
    assert res.status_code == 200
    assert match_regex("(Jack|said)+", res.body["content"])
    assert res.body["timings"]["prompt_n"] == 1


def test_slot_erase():
    global server
    server.start()

    res = server.make_request("POST", "/completion", data={
        "prompt": "What is the capital of France?",
        "id_slot": 1,
        "cache_prompt": True,
    })
    assert res.status_code == 200
    assert match_regex("(Whiskers|Flana)+", res.body["content"])
    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed

    # erase slot 1
    res = server.make_request("POST", "/slots/1?action=erase")
    assert res.status_code == 200

    # re-run the same prompt, it should process all tokens again
    res = server.make_request("POST", "/completion", data={
        "prompt": "What is the capital of France?",
        "id_slot": 1,
        "cache_prompt": True,
    })
    assert res.status_code == 200
    assert match_regex("(Whiskers|Flana)+", res.body["content"])
    assert res.body["timings"]["prompt_n"] == 21  # all tokens are processed
59 examples/server/tests/unit/test_tokenize.py Normal file
@@ -0,0 +1,59 @@
import pytest
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


def test_tokenize_detokenize():
    global server
    server.start()
    # tokenize
    content = "What is the capital of France ?"
    res_tok = server.make_request("POST", "/tokenize", data={
        "content": content
    })
    assert res_tok.status_code == 200
    assert len(res_tok.body["tokens"]) > 5
    # detokenize
    res_detok = server.make_request("POST", "/detokenize", data={
        "tokens": res_tok.body["tokens"],
    })
    assert res_detok.status_code == 200
    assert res_detok.body["content"].strip() == content


def test_tokenize_with_bos():
    global server
    server.start()
    # tokenize
    content = "What is the capital of France ?"
    bosId = 1
    res_tok = server.make_request("POST", "/tokenize", data={
        "content": content,
        "add_special": True,
    })
    assert res_tok.status_code == 200
    assert res_tok.body["tokens"][0] == bosId


def test_tokenize_with_pieces():
    global server
    server.start()
    # tokenize
    content = "This is a test string with unicode 媽 and emoji 🤗"
    res_tok = server.make_request("POST", "/tokenize", data={
        "content": content,
        "with_pieces": True,
    })
    assert res_tok.status_code == 200
    for token in res_tok.body["tokens"]:
        assert "id" in token
        assert token["id"] > 0
        assert "piece" in token
        assert len(token["piece"]) > 0
377 examples/server/tests/utils.py Normal file
@@ -0,0 +1,377 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# type: ignore[reportUnusedImport]

import subprocess
import os
import re
import json
import sys
import threading
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import (
    Any,
    Callable,
    ContextManager,
    Iterable,
    Iterator,
    List,
    Literal,
    Tuple,
    Set,
)
from re import RegexFlag


class ServerResponse:
    headers: dict
    status_code: int
    body: dict | Any


class ServerProcess:
    # default options
    debug: bool = False
    server_port: int = 8080
    server_host: str = "127.0.0.1"
    model_hf_repo: str = "ggml-org/models"
    model_hf_file: str = "tinyllamas/stories260K.gguf"
    model_alias: str = "tinyllama-2"
    temperature: float = 0.8
    seed: int = 42

    # custom options
    model_alias: str | None = None
    model_url: str | None = None
    model_file: str | None = None
    n_threads: int | None = None
    n_gpu_layer: int | None = None
    n_batch: int | None = None
    n_ubatch: int | None = None
    n_ctx: int | None = None
    n_ga: int | None = None
    n_ga_w: int | None = None
    n_predict: int | None = None
    n_prompts: int | None = 0
    slot_save_path: str | None = None
    id_slot: int | None = None
    cache_prompt: bool | None = None
    n_slots: int | None = None
    server_continuous_batching: bool | None = False
    server_embeddings: bool | None = False
    server_reranking: bool | None = False
    server_metrics: bool | None = False
    draft: int | None = None
    api_key: str | None = None
    response_format: str | None = None
    lora_files: List[str] | None = None
    disable_ctx_shift: int | None = False

    # session variables
    process: subprocess.Popen | None = None

    def __init__(self):
        if "N_GPU_LAYERS" in os.environ:
            self.n_gpu_layer = int(os.environ["N_GPU_LAYERS"])
        if "DEBUG" in os.environ:
            self.debug = True
        if "PORT" in os.environ:
            self.server_port = int(os.environ["PORT"])

    def start(self, timeout_seconds: int = 10) -> None:
        if "LLAMA_SERVER_BIN_PATH" in os.environ:
            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
        elif os.name == "nt":
            server_path = "../../../build/bin/Release/llama-server.exe"
        else:
            server_path = "../../../build/bin/llama-server"
        server_args = [
            "--slots",  # required to get slot status via the /slots endpoint
            "--host",
            self.server_host,
            "--port",
            self.server_port,
            "--temp",
            self.temperature,
            "--seed",
            self.seed,
        ]
        if self.model_file:
            server_args.extend(["--model", self.model_file])
        if self.model_url:
            server_args.extend(["--model-url", self.model_url])
        if self.model_hf_repo:
            server_args.extend(["--hf-repo", self.model_hf_repo])
        if self.model_hf_file:
            server_args.extend(["--hf-file", self.model_hf_file])
        if self.n_batch:
            server_args.extend(["--batch-size", self.n_batch])
        if self.n_ubatch:
            server_args.extend(["--ubatch-size", self.n_ubatch])
        if self.n_threads:
            server_args.extend(["--threads", self.n_threads])
        if self.n_gpu_layer:
            server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
        if self.draft is not None:
            server_args.extend(["--draft", self.draft])
        if self.server_continuous_batching:
            server_args.append("--cont-batching")
        if self.server_embeddings:
            server_args.append("--embedding")
        if self.server_reranking:
            server_args.append("--reranking")
        if self.server_metrics:
            server_args.append("--metrics")
        if self.model_alias:
            server_args.extend(["--alias", self.model_alias])
        if self.n_ctx:
            server_args.extend(["--ctx-size", self.n_ctx])
        if self.n_slots:
            server_args.extend(["--parallel", self.n_slots])
        if self.n_predict:
            server_args.extend(["--n-predict", self.n_predict])
        if self.slot_save_path:
            server_args.extend(["--slot-save-path", self.slot_save_path])
        if self.n_ga:
            server_args.extend(["--grp-attn-n", self.n_ga])
        if self.n_ga_w:
            server_args.extend(["--grp-attn-w", self.n_ga_w])
        if self.debug:
            server_args.append("--verbose")
        if self.lora_files:
            for lora_file in self.lora_files:
                server_args.extend(["--lora", lora_file])
        if self.disable_ctx_shift:
            server_args.extend(["--no-context-shift"])
        if self.api_key:
            server_args.extend(["--api-key", self.api_key])

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"bench: starting server with: {' '.join(args)}")

        flags = 0
        if "nt" == os.name:
            flags |= subprocess.DETACHED_PROCESS
            flags |= subprocess.CREATE_NEW_PROCESS_GROUP
            flags |= subprocess.CREATE_NO_WINDOW

        self.process = subprocess.Popen(
            args,
            creationflags=flags,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env={**os.environ, "LLAMA_CACHE": "tmp"},
        )
        server_instances.add(self)

        def server_log(in_stream, out_stream):
            for line in iter(in_stream.readline, b""):
                print(line.decode("utf-8"), end="", file=out_stream)

        thread_stdout = threading.Thread(
            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
        )
        thread_stdout.start()

        thread_stderr = threading.Thread(
            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
        )
        thread_stderr.start()

        print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")

        # wait for the server to start
        start_time = time.time()
        while time.time() - start_time < timeout_seconds:
            try:
                response = self.make_request("GET", "/slots", headers={
                    "Authorization": f"Bearer {self.api_key}" if self.api_key else None
                })
                if response.status_code == 200:
                    self.ready = True
                    return  # server is ready
            except Exception:
                pass
            print("Waiting for server to start...")
            time.sleep(0.5)
        raise TimeoutError(f"Server did not start within {timeout_seconds} seconds")

    def stop(self) -> None:
        server_instances.remove(self)
        if self.process:
            print(f"Stopping server with pid={self.process.pid}")
            self.process.kill()
            self.process = None

    def make_request(
        self,
        method: str,
        path: str,
        data: dict | Any | None = None,
        headers: dict | None = None,
    ) -> ServerResponse:
        url = f"http://{self.server_host}:{self.server_port}{path}"
        parse_body = False
        if method == "GET":
            response = requests.get(url, headers=headers)
            parse_body = True
        elif method == "POST":
            response = requests.post(url, headers=headers, json=data)
            parse_body = True
        elif method == "OPTIONS":
            response = requests.options(url, headers=headers)
        else:
            raise ValueError(f"Unimplemented method: {method}")
        result = ServerResponse()
        result.headers = dict(response.headers)
        result.status_code = response.status_code
        result.body = response.json() if parse_body else None
        print("Response from server", result.body)
        return result

    def make_stream_request(
        self,
        method: str,
        path: str,
        data: dict | None = None,
        headers: dict | None = None,
    ) -> Iterator[dict]:
        url = f"http://{self.server_host}:{self.server_port}{path}"
        if method == "POST":
            response = requests.post(url, headers=headers, json=data, stream=True)
        else:
            raise ValueError(f"Unimplemented method: {method}")
        for line_bytes in response.iter_lines():
            line = line_bytes.decode("utf-8")
            if '[DONE]' in line:
                break
            elif line.startswith('data: '):
                data = json.loads(line[6:])
                print("Partial response from server", data)
                yield data


server_instances: Set[ServerProcess] = set()


class ServerPreset:
    @staticmethod
    def tinyllama2() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "tinyllamas/stories260K.gguf"
        server.model_alias = "tinyllama-2"
        server.n_ctx = 256
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 64
        server.seed = 42
        return server

    @staticmethod
    def bert_bge_small() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
        server.model_alias = "bert-bge-small"
        server.n_ctx = 512
        server.n_batch = 128
        server.n_ubatch = 128
        server.n_slots = 2
        server.seed = 42
        server.server_embeddings = True
        return server

    @staticmethod
    def tinyllama_infill() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
        server.model_alias = "tinyllama-infill"
        server.n_ctx = 2048
        server.n_batch = 1024
        server.n_slots = 1
        server.n_predict = 64
        server.temperature = 0.0
        server.seed = 42
        return server

    @staticmethod
    def stories15m_moe() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/stories15M_MOE"
        server.model_hf_file = "stories15M_MOE-F16.gguf"
        server.model_alias = "stories15m-moe"
        server.n_ctx = 2048
        server.n_batch = 1024
        server.n_slots = 1
        server.n_predict = 64
        server.temperature = 0.0
        server.seed = 42
        return server

    @staticmethod
    def jina_reranker_tiny() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
        server.model_alias = "jina-reranker"
        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
        server.n_ctx = 512
        server.n_batch = 512
        server.n_slots = 1
        server.seed = 42
        server.server_reranking = True
        return server


def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
    """
    Run multiple functions in parallel and return results in the same order as the calls. Equivalent to Promise.all in JS.

    Example usage:

    results = parallel_function_calls([
        (func1, (arg1, arg2)),
        (func2, (arg3, arg4)),
    ])
    """
    results = [None] * len(function_list)
    exceptions = []

    def worker(index, func, args):
        try:
            result = func(*args)
            results[index] = result
        except Exception as e:
            exceptions.append((index, str(e)))

    with ThreadPoolExecutor() as executor:
        futures = []
        for i, (func, args) in enumerate(function_list):
            future = executor.submit(worker, i, func, args)
            futures.append(future)

        # Wait for all futures to complete
        for future in as_completed(futures):
            pass

    # Check if there were any exceptions
    if exceptions:
        print("Exceptions occurred:")
        for index, error in exceptions:
            print(f"Function at index {index}: {error}")

    return results


def match_regex(regex: str, text: str) -> bool:
    return (
        re.compile(
            regex, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL
        ).search(text)
        is not None
    )
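Putting these helpers together, a new test module follows the same pattern as the files above; a minimal sketch (the module name is hypothetical, the endpoint and fields mirror the existing tests):

# unit/test_example.py -- hypothetical module, same structure as the real tests
import pytest
from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


def test_basic_completion():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={"prompt": "Hi", "n_predict": 8})
    assert res.status_code == 200
    assert len(res.body["content"]) > 0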
|
@ -14,7 +14,7 @@
|
|||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_NEON) && !defined(__CUDACC__)
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
|
|
|
@ -1232,8 +1232,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
std::cerr << "ggml_vulkan: Compiling shaders";
|
||||
|
||||
// mulmat
|
||||
std::vector<uint32_t> l_warptile, m_warptile, s_warptile, l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
|
||||
std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms;
|
||||
std::vector<uint32_t> l_warptile, m_warptile, s_warptile,
|
||||
l_warptile_mmq, m_warptile_mmq, s_warptile_mmq;
|
||||
std::array<uint32_t, 3> l_wg_denoms, m_wg_denoms, s_wg_denoms,
|
||||
l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms;
|
||||
uint32_t l_align, m_align, s_align;
|
||||
|
||||
l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
|
||||
|
@ -1244,14 +1246,48 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
|
||||
s_warptile_mmq = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
|
||||
|
||||
l_wg_denoms = {128, 128, 1 };
|
||||
m_wg_denoms = { 64, 64, 1 };
|
||||
s_wg_denoms = { 32, 32, 1 };
|
||||
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
|
||||
m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
|
||||
s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 };
|
||||
|
||||
l_align = 128;
|
||||
m_align = 64;
|
||||
s_align = 32;
|
||||
|
||||
// Fallback to smaller sizes if there's not enough shared memory. Given the current shaders
|
||||
// and tile sizes, this should handle 16KB, 32KB, and 48KB+.
|
||||
// This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders.
|
||||
// But the numbers happen to work out for 32KB shared memory size that when using the medium
|
||||
// size there's enough room for everything, and we assert for this.
|
||||
uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
|
||||
if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
|
||||
l_warptile = m_warptile;
|
||||
l_wg_denoms = m_wg_denoms;
|
||||
shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
|
||||
GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
|
||||
}
|
||||
if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
|
||||
// assert mul_mat_mat_id shaders will fit.
|
||||
GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
|
||||
}
|
||||
|
||||
shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
|
||||
if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
|
||||
if (device->properties.limits.maxComputeSharedMemorySize == 32768) {
|
||||
l_warptile_mmq = m_warptile_mmq;
|
||||
l_mmq_wg_denoms = m_mmq_wg_denoms;
|
||||
} else {
|
||||
l_warptile_mmq = s_warptile_mmq;
|
||||
l_mmq_wg_denoms = s_mmq_wg_denoms;
|
||||
}
|
||||
shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
|
||||
GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
|
||||
}
|
||||
if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
|
||||
// assert mul_mat_mat_id shaders will fit.
|
||||
GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
|
||||
}
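To make the shared-memory comment above concrete: with the large tile l_warptile = {128, 128, 128, 16, ...}, the estimate is (128 + 128) * (16 + 1) * sizeof(float) = 17408 bytes, which does not fit a 16 KB limit, so the fallback to the medium tile kicks in; on a 32 KB device the additional 3072 * 4 = 12288 bytes of row_ids must also fit, which is what the asserts check. A small sketch of the same arithmetic, for illustration only (not the C++ itself):

# Illustration of the shared-memory estimate used in ggml_vk_load_shaders.
def shmem_needed(warptile, sizeof_float=4):
    # (warptile[1] + warptile[2]) * (warptile[3] + 1) * sizeof(float), as in the code above
    return (warptile[1] + warptile[2]) * (warptile[3] + 1) * sizeof_float

l_warptile = [128, 128, 128, 16]            # first four entries shown in the diff
print(shmem_needed(l_warptile))             # 17408 bytes -> exceeds a 16 KB limit
print(shmem_needed(l_warptile) + 3072 * 4)  # 29696 bytes -> fits the 32 KB mul_mat_id budget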
|
||||
|
||||
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
|
||||
|
@ -1299,35 +1335,38 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
|
||||
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
||||
if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
|
||||
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
}
|
||||
#undef CREATE_MM
|
||||
} else {
|
||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||
|
@ -1344,35 +1383,38 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3);
|
||||
|
||||
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
||||
if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
|
||||
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4);
|
||||
}
|
||||
#undef CREATE_MM
|
||||
}
|
||||
|
||||
|
@ -6541,6 +6583,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
if (op->op == GGML_OP_MUL_MAT_ID &&
|
||||
ggml_vk_get_device(ctx->device)->properties.limits.maxComputeSharedMemorySize < 32768) {
|
||||
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
|
||||
return false;
|
||||
}
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
|
@ -7157,7 +7205,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
|||
const int32_t max_period = tensor->op_params[1];
|
||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
const int32_t k1 = tensor->op_params[2];
|
||||
const int32_t s0 = tensor->op_params[3];
|
||||
|
|
|
@ -19,7 +19,7 @@ void main() {
|
|||
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint start = gl_WorkGroupID.x * group_size + tid;
|
||||
const uint end = start + group_size;
|
||||
const uint end = (gl_WorkGroupID.x + 1) * group_size;
|
||||
|
||||
tmp[tid] = 0.0f;
|
||||
|
||||
|
|
|
@ -52,13 +52,16 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
|
|||
#endif
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
uint batch_idx_a = 0;
|
||||
if (batch_idx != 0) {
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
|
||||
const uint i03 = i13 / p.broadcast3;
|
||||
const uint i02 = i12 / p.broadcast2;
|
||||
const uint i03 = i13 / p.broadcast3;
|
||||
const uint i02 = i12 / p.broadcast2;
|
||||
|
||||
const uint batch_idx_a = i03 * p.ne02 + i02;
|
||||
batch_idx_a = i03 * p.ne02 + i02;
|
||||
}
|
||||
#else
|
||||
const uint expert_id = data_ids[expert_idx];
|
||||
#endif
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
|
@ -32,38 +33,67 @@ void main() {
|
|||
const uint s_offset = 8*v_im;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = d.x;
|
||||
const FLOAT_TYPE dmin = d.y;
|
||||
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];

uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];

uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;

uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));

uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
uvec2 qs0 = uvec2(unpack8(qs0_u16));
uvec2 qs16 = uvec2(unpack8(qs16_u16));

FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1))))))));
sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2))))))));
[[unroll]] for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3),
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]),
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]),
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]),
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]),
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
}
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx]));
temp = fma(dall, sum1, fma(-dmin, sum2, temp));
}

tmp[gl_LocalInvocationID.x] = temp;

// sum up partial sums and write back result
barrier();
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
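The rewritten Q2_K loop above pulls eight sub-block scale bytes in as two packed 32-bit words and splits each byte into its low nibble (the per-sub-block scale, fed into sum1) and its high nibble (the per-sub-block min, fed into sum2) with one mask each, before unpack8 spreads them into vectors. A minimal CPU-side sketch of that nibble split in C, with made-up input values (the variable names here are ours, not the shader's):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* four Q2_K scale bytes packed into one 32-bit word, as the shader
       reads them through data_a_packed32 (example value only) */
    uint32_t s0_u32 = 0xA1B2C3D4u;

    uint32_t lo4 = s0_u32 & 0x0F0F0F0F;        /* low nibble of each byte: sub-block scales */
    uint32_t hi4 = (s0_u32 >> 4) & 0x0F0F0F0F; /* high nibble of each byte: sub-block mins  */

    for (int j = 0; j < 4; ++j) {
        printf("byte %d: scale=%u min=%u\n", j,
               (unsigned)((lo4 >> (8 * j)) & 0xFF),
               (unsigned)((hi4 >> (8 * j)) & 0xFF));
    }
    return 0;
}

The shader then combines the two accumulators as dall*sum1 - dmin*sum2, which is why the low and high nibbles are kept in separate words.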
@@ -1,4 +1,5 @@
#version 450
#extension GL_EXT_shader_explicit_arithmetic_types : require

#include "mul_mat_vec_base.comp"

@@ -33,7 +34,7 @@ void main() {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0;

tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp

const uint s_shift = 4 * v_im;

@@ -42,21 +43,44 @@ void main() {

const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);

B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];

uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
u8vec2 s0 = unpack8(s0_16);
u8vec2 s2 = unpack8(s2_16);
u8vec2 s4 = unpack8(s4_16);
u8vec2 s6 = unpack8(s6_16);
u8vec2 s8 = unpack8(s8_16);
u8vec2 s10 = unpack8(s10_16);

FLOAT_TYPE sum = FLOAT_TYPE(0.0);
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
}
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]);
temp = fma(d, sum, temp);
}

tmp[gl_LocalInvocationID.x] = temp;

// sum up partial sums and write back result
barrier();
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
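Each Q3_K term above rebuilds a 6-bit signed scale from two packed arrays (4 low bits from scales[0..7], 2 high bits from scales[8..11]) and subtracts 4 from the 2-bit quant whenever the matching hmask bit is clear; the new variant only changes where those scale bytes come from (the pre-unpacked s0..s10 pairs instead of repeated byte loads). A scalar C sketch of one such term, with toy values of our own:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* toy Q3_K block fragments (values made up for the example) */
    uint8_t scales[12] = { 0x4B, 0, 0, 0, 0, 0, 0, 0, 0x2D, 0, 0, 0 };
    uint8_t qs_byte    = 0xB6; /* four 2-bit quants packed in one byte        */
    uint8_t hmask_byte = 0x01; /* high-bit mask byte for the same position    */
    int     s_shift    = 0;    /* 0 or 4, depending on which half of the block */
    uint8_t m          = 0x01; /* hmask bit selecting this sub-block           */

    /* 6-bit signed scale for sub-block 0, mirroring the first fma term above:
       low 4 bits from scales[0], 2 high bits from scales[8], re-centered by -32 */
    int scale0 = (int)(int8_t)(((scales[0] >> s_shift) & 0xF) |
                               (((scales[8] >> (s_shift + 0)) & 0x3) << 4)) - 32;

    /* 2-bit quant, minus 4 when the corresponding hmask bit is clear */
    int q = ((qs_byte >> 0) & 3) - (((hmask_byte & m) != 0) ? 0 : 4);

    printf("scale0=%d q=%d contribution=%d\n", scale0, q, scale0 * q);
    return 0;
}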
@@ -34,9 +34,6 @@ void main() {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;

const uint8_t hm1 = uint8_t(1 << (2*v_im));
const uint8_t hm2 = uint8_t(hm1 << 4);

FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp

[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {

@@ -71,6 +68,18 @@ void main() {
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;

uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));

uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;

qs0_16_u32_lo4 += qs0_16_lo4_offset16;
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
qs64_80_u32_hi4 += qs64_80_hi4_offset16;

uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));

@@ -102,31 +111,26 @@ void main() {
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];

uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2];
uint32_t qh1 = qh0 >> 8;
uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8];
uint32_t qh17 = qh16 >> 8;

const FLOAT_TYPE sx =
fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)),
FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0)))));
fma(FLOAT_TYPE(by10.x), q4_0,
fma(FLOAT_TYPE(by10.y), q4_1,
fma(FLOAT_TYPE(by116.x), q4_2,
FLOAT_TYPE(by116.y) * q4_3)));
const FLOAT_TYPE sy =
fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)),
FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0)))));
fma(FLOAT_TYPE(by132.x), q4_4,
fma(FLOAT_TYPE(by132.y), q4_5,
fma(FLOAT_TYPE(by148.x), q4_6,
FLOAT_TYPE(by148.y) * q4_7)));
const FLOAT_TYPE sz =
fma(FLOAT_TYPE(by20.x), (q4_8 + (((qh0 & hm2) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by20.y), (q4_9 + (((qh1 & hm2) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)),
FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0)))));
fma(FLOAT_TYPE(by20.x), q4_8,
fma(FLOAT_TYPE(by20.y), q4_9,
fma(FLOAT_TYPE(by216.x), q4_10,
FLOAT_TYPE(by216.y) * q4_11)));
const FLOAT_TYPE sw =
fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)),
FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0)))));
fma(FLOAT_TYPE(by232.x), q4_12,
fma(FLOAT_TYPE(by232.y), q4_13,
fma(FLOAT_TYPE(by248.x), q4_14,
FLOAT_TYPE(by248.y) * q4_15)));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
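The old Q5_K expressions above widen each 4-bit quant to 5 bits by adding 16 whenever the selected qh bit is set; the new code reaches the same values by folding those +16 offsets into the packed nibble words (the *_offset16 adds) before unpacking. A scalar C sketch of the underlying reconstruction, with toy inputs of our own:

#include <stdint.h>
#include <stdio.h>

/* Promote a 4-bit Q5_K quant to 5 bits: add 16 when the selected qh bit is set.
   This mirrors the "(q4_x + (((qh & mask) != 0) ? 16 : 0))" terms in the old code. */
static int q5k_quant(uint8_t q4, uint8_t qh, uint8_t mask) {
    return (int)q4 + (((qh & mask) != 0) ? 16 : 0);
}

int main(void) {
    uint8_t qh  = 0x12; /* toy high-bit byte                        */
    uint8_t hm1 = 0x02; /* toy mask, like 1 << (2*v_im) in the shader */
    printf("%d\n", q5k_quant(7, qh, hm1));                  /* bit set: 7 + 16 = 23 */
    printf("%d\n", q5k_quant(7, qh, (uint8_t)(hm1 << 4)));  /* bit clear: stays 7   */
    return 0;
}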
@@ -30,10 +30,8 @@
#endif
#endif

#if defined(DATA_A_Q4_0)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 2
#define QUANT_K_Q4_0 32
#define QUANT_R_Q4_0 2

struct block_q4_0
{

@@ -46,14 +44,15 @@ struct block_q4_0_packed16
uint16_t qs[16/2];
};

#if defined(DATA_A_Q4_0)
#define QUANT_K QUANT_K_Q4_0
#define QUANT_R QUANT_R_Q4_0
#define A_TYPE block_q4_0
#define A_TYPE_PACKED16 block_q4_0_packed16
#endif

#if defined(DATA_A_Q4_1)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 2
#define QUANT_K_Q4_1 32
#define QUANT_R_Q4_1 2

struct block_q4_1
{

@@ -69,15 +68,15 @@ struct block_q4_1_packed16
uint16_t qs[16/2];
};

#if defined(DATA_A_Q4_1)
#define QUANT_K QUANT_K_Q4_1
#define QUANT_R QUANT_R_Q4_1
#define A_TYPE block_q4_1
#define A_TYPE_PACKED16 block_q4_1_packed16
#endif

#if defined(DATA_A_Q5_0)
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#define QUANT_K 32
#define QUANT_R 2
#define QUANT_K_Q5_0 32
#define QUANT_R_Q5_0 2

struct block_q5_0
{

@@ -93,15 +92,15 @@ struct block_q5_0_packed16
uint16_t qs[16/2];
};

#if defined(DATA_A_Q5_0)
#define QUANT_K QUANT_K_Q5_0
#define QUANT_R QUANT_R_Q5_0
#define A_TYPE block_q5_0
#define A_TYPE_PACKED16 block_q5_0_packed16
#endif

#if defined(DATA_A_Q5_1)
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#define QUANT_K 32
#define QUANT_R 2
#define QUANT_K_Q5_1 32
#define QUANT_R_Q5_1 2

struct block_q5_1
{

@@ -119,14 +118,15 @@ struct block_q5_1_packed16
uint16_t qs[16/2];
};

#if defined(DATA_A_Q5_1)
#define QUANT_K QUANT_K_Q5_1
#define QUANT_R QUANT_R_Q5_1
#define A_TYPE block_q5_1
#define A_TYPE_PACKED16 block_q5_1_packed16
#endif

#if defined(DATA_A_Q8_0)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 1
#define QUANT_K_Q8_0 32
#define QUANT_R_Q8_0 1

struct block_q8_0
{

@@ -139,164 +139,164 @@ struct block_q8_0_packed16
uint16_t qs[32/2];
};

#if defined(DATA_A_Q8_0)
#define QUANT_K QUANT_K_Q8_0
#define QUANT_R QUANT_R_Q8_0
#define A_TYPE block_q8_0
#define A_TYPE_PACKED16 block_q8_0_packed16
#endif

// K-quants
#if defined(DATA_A_Q2_K)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 256
#define QUANT_K_Q2_K 256

struct block_q2_K
{
uint8_t scales[QUANT_K/16];
uint8_t qs[QUANT_K/4];
uint8_t scales[QUANT_K_Q2_K/16];
uint8_t qs[QUANT_K_Q2_K/4];
f16vec2 d;
};

struct block_q2_K_packed16
{
uint16_t scales[QUANT_K/16/2];
uint16_t qs[QUANT_K/4/2];
uint16_t scales[QUANT_K_Q2_K/16/2];
uint16_t qs[QUANT_K_Q2_K/4/2];
f16vec2 d;
};

struct block_q2_K_packed32
{
uint32_t scales[QUANT_K/16/4];
uint32_t qs[QUANT_K/4/4];
uint32_t scales[QUANT_K_Q2_K/16/4];
uint32_t qs[QUANT_K_Q2_K/4/4];
f16vec2 d;
};

#if defined(DATA_A_Q2_K)
#define QUANT_K QUANT_K_Q2_K
#define A_TYPE block_q2_K
#define A_TYPE_PACKED16 block_q2_K_packed16
#define A_TYPE_PACKED32 block_q2_K_packed32
#endif

#if defined(DATA_A_Q3_K)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 256
#define QUANT_K_Q3_K 256

struct block_q3_K
{
uint8_t hmask[QUANT_K/8];
uint8_t qs[QUANT_K/4];
uint8_t hmask[QUANT_K_Q3_K/8];
uint8_t qs[QUANT_K_Q3_K/4];
uint8_t scales[12];
float16_t d;
};

struct block_q3_K_packed16
{
uint16_t hmask[QUANT_K/8/2];
uint16_t qs[QUANT_K/4/2];
uint16_t hmask[QUANT_K_Q3_K/8/2];
uint16_t qs[QUANT_K_Q3_K/4/2];
uint16_t scales[12/2];
float16_t d;
};

#if defined(DATA_A_Q3_K)
#define QUANT_K QUANT_K_Q3_K
#define A_TYPE block_q3_K
#define A_TYPE_PACKED16 block_q3_K_packed16
#endif

#if defined(DATA_A_Q4_K)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 256
#define QUANT_K_Q4_K 256

struct block_q4_K
{
f16vec2 d;
uint8_t scales[3*QUANT_K/64];
uint8_t qs[QUANT_K/2];
uint8_t scales[3*QUANT_K_Q4_K/64];
uint8_t qs[QUANT_K_Q4_K/2];
};

struct block_q4_K_packed16
{
f16vec2 d;
uint16_t scales[3*QUANT_K/64/2];
uint16_t qs[QUANT_K/2/2];
uint16_t scales[3*QUANT_K_Q4_K/64/2];
uint16_t qs[QUANT_K_Q4_K/2/2];
};

struct block_q4_K_packed32
{
f16vec2 d;
uint32_t scales[3*QUANT_K/64/4];
uint32_t qs[QUANT_K/2/4];
uint32_t scales[3*QUANT_K_Q4_K/64/4];
uint32_t qs[QUANT_K_Q4_K/2/4];
};

#if defined(DATA_A_Q4_K)
#define QUANT_K QUANT_K_Q4_K
#define A_TYPE block_q4_K
#define A_TYPE_PACKED16 block_q4_K_packed16
#define A_TYPE_PACKED32 block_q4_K_packed32
#endif

#if defined(DATA_A_Q5_K)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 256
#define QUANT_K_Q5_K 256

struct block_q5_K
{
f16vec2 d;
uint8_t scales[12];
uint8_t qh[QUANT_K/8];
uint8_t qs[QUANT_K/2];
uint8_t qh[QUANT_K_Q5_K/8];
uint8_t qs[QUANT_K_Q5_K/2];
};

struct block_q5_K_packed16
{
f16vec2 d;
uint16_t scales[12/2];
uint16_t qh[QUANT_K/8/2];
uint16_t qs[QUANT_K/2/2];
uint16_t qh[QUANT_K_Q5_K/8/2];
uint16_t qs[QUANT_K_Q5_K/2/2];
};

#if defined(DATA_A_Q5_K)
#define QUANT_K QUANT_K_Q5_K
#define A_TYPE block_q5_K
#define A_TYPE_PACKED16 block_q5_K_packed16
#endif

#if defined(DATA_A_Q6_K)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 256
#define QUANT_K_Q6_K 256

struct block_q6_K
{
uint8_t ql[QUANT_K/2];
uint8_t qh[QUANT_K/4];
int8_t scales[QUANT_K/16];
uint8_t ql[QUANT_K_Q6_K/2];
uint8_t qh[QUANT_K_Q6_K/4];
int8_t scales[QUANT_K_Q6_K/16];
float16_t d;
};

struct block_q6_K_packed16
{
uint16_t ql[QUANT_K/2/2];
uint16_t qh[QUANT_K/4/2];
int8_t scales[QUANT_K/16];
uint16_t ql[QUANT_K_Q6_K/2/2];
uint16_t qh[QUANT_K_Q6_K/4/2];
int8_t scales[QUANT_K_Q6_K/16];
float16_t d;
};

#if defined(DATA_A_Q6_K)
#define QUANT_K QUANT_K_Q6_K
#define A_TYPE block_q6_K
#define A_TYPE_PACKED16 block_q6_K_packed16
#endif

// IQuants

#if defined(DATA_A_IQ4_NL)
#extension GL_EXT_shader_16bit_storage : require
#define QUANT_K 32
#define QUANT_R 2
#define QUANT_K_IQ4_NL 32
#define QUANT_R_IQ4_NL 2

struct block_iq4_nl
{
float16_t d;
uint8_t qs[QUANT_K/2];
uint8_t qs[QUANT_K_IQ4_NL/2];
};

struct block_iq4_nl_packed16
{
float16_t d;
uint16_t qs[QUANT_K/2/2];
uint16_t qs[QUANT_K_IQ4_NL/2/2];
};

#define A_TYPE block_iq4_nl
#define A_TYPE_PACKED16 block_iq4_nl_packed16
#if defined(DATA_A_IQ4_NL)

const int8_t kvalues_iq4nl_const[16] = {
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),

@@ -313,6 +313,11 @@ void init_iq4nl_shmem()
}
barrier();
}

#define QUANT_K QUANT_K_IQ4_NL
#define QUANT_R QUANT_R_IQ4_NL
#define A_TYPE block_iq4_nl
#define A_TYPE_PACKED16 block_iq4_nl_packed16
#endif

#endif // !defined(GGML_TYPES_COMP)
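The QUANT_K_* constants above fix how many weights one block covers, and the struct layouts fix its byte size, so bytes-per-weight falls out of the two. A small C sketch (our own mirror of block_q4_0, not shared with the shader sources) that checks this for Q4_0:

#include <stdint.h>
#include <stdio.h>

#define QUANT_K_Q4_0 32  /* weights per block, as in types.comp */

/* CPU-side mirror of block_q4_0: one fp16 scale plus 32 packed 4-bit quants */
#pragma pack(push, 1)
typedef struct {
    uint16_t d;                    /* fp16 bits of the block scale */
    uint8_t  qs[QUANT_K_Q4_0 / 2]; /* two 4-bit quants per byte    */
} block_q4_0;
#pragma pack(pop)

int main(void) {
    printf("block size: %zu bytes for %d weights (%.3f bytes/weight)\n",
           sizeof(block_q4_0), QUANT_K_Q4_0,
           (double)sizeof(block_q4_0) / QUANT_K_Q4_0);
    return 0;
}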