diff --git a/Makefile b/Makefile
index 9e46d13f8..6d31892c0 100644
--- a/Makefile
+++ b/Makefile
@@ -769,25 +769,25 @@ clean:
 	rm -vrf llguidance
 
 # useful tools
-main: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+main: tools/completion/completion.cpp common/arg.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-mainvk: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
+mainvk: tools/completion/completion.cpp common/arg.cpp common/preset.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-fitparams: tools/fit-params/fit-params.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
+fitparams: tools/fit-params/fit-params.cpp common/arg.cpp common/preset.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/version.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-ttsmain: tools/tts/tts.cpp common/arg.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+ttsmain: tools/tts/tts.cpp common/arg.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf-split: tools/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp tools/mtmd/mtmd-helper.cpp tools/mtmd/clip.cpp common/arg.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+mtmd-cli: tools/mtmd/mtmd-cli.cpp tools/mtmd/mtmd.cpp tools/mtmd/mtmd-helper.cpp tools/mtmd/clip.cpp common/arg.cpp common/preset.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp common/arg.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+embedding: examples/embedding/embedding.cpp common/arg.cpp common/preset.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-embeddingvk: examples/embedding/embedding.cpp common/arg.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
+embeddingvk: examples/embedding/embedding.cpp common/arg.cpp common/preset.cpp common/download.cpp src/llama-cparams.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/ttscpp.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/common/arg.cpp b/common/arg.cpp
index c808b6649..e9eb538c0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -51,6 +51,7 @@
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
 extern const char * LICENSES[];
+const char * LICENSES[] = { nullptr }; // stub table for builds without a generated license list: one nullptr entry keeps the array non-empty (a zero-length array is non-standard C++) and serves as the end marker for any null-terminated walk
 
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;
diff --git a/docs/android/imported-into-android-studio.jpg b/docs/android/imported-into-android-studio.jpg
deleted file mode 100644
index bbe6867c6..000000000
Binary files a/docs/android/imported-into-android-studio.jpg and /dev/null differ
diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh
deleted file mode 100755
index da9df110a..000000000
--- a/scripts/snapdragon/adb/run-completion.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/sh
-#
-
-# Basedir on device
-basedir=/data/local/tmp/llama.cpp
-
-cli_opts=
-
-branch=.
-[ "$B" != "" ] && branch=$B
-
-adbserial=
-[ "$S" != "" ] && adbserial="-s $S"
-
-adbhost=
-[ "$H" != "" ] && adbhost="-H $H"
-
-model="Llama-3.2-3B-Instruct-Q4_0.gguf"
-[ "$M" != "" ] && model="$M"
-
-device="HTP0"
-[ "$D" != "" ] && device="$D"
-
-experimental=
-[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
-
-verbose=
-[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
-
-sched=
-[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
-
-profile=
-[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
-
-opmask=
-[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
-
-nhvx=
-[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
-
-ndev=
-[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
-
-hb=
-[ "$HB" != "" ] && hb="GGML_HEXAGON_HOSTBUF=$HB"
-
-set -x
-
-adb $adbserial $adbhost shell " \
-  cd $basedir; ulimit -c unlimited;        \
-    LD_LIBRARY_PATH=$basedir/$branch/lib   \
-    ADSP_LIBRARY_PATH=$basedir/$branch/lib \
-    $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb        \
-      ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
-         --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1                  \
-         --ctx-size 8192 --batch-size 128 -fa on \
-         -ngl 99 -no-cnv --device $device $cli_opts $@   \
-"
diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp
deleted file mode 100644
index c10bde91b..000000000
--- a/tests/test-backend-sampler.cpp
+++ /dev/null
@@ -1,1165 +0,0 @@
-#include "ggml.h"
-#include "llama.h"
-#include "llama-cpp.h"
-#include "get-model.h"
-#include "common.h"
-
-#ifdef NDEBUG
-#undef NDEBUG
-#endif
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-struct test_args {
-    std::string model;
-    std::string test;
-    std::string device = "auto";
-};
-
-struct test_params {
-    llama_model_ptr model;
-};
-
-static llama_model_ptr load_model(const test_args & args) {
-    auto mparams = llama_model_default_params();
-
-    ggml_backend_dev_t devs[2] = { nullptr, nullptr };
-
-    if (args.device != "auto") {
-        if (args.device == "gpu") {
-            devs[0] = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-
-            if (devs[0] == nullptr) {
-                fprintf(stderr, "Error: GPU requested but not available\n");
-                return nullptr;
-            }
-
-            mparams.n_gpu_layers = 999;
-        } else if (args.device == "cpu") {
-            devs[0] = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-
-            mparams.n_gpu_layers = 0;
-        } else {
-            fprintf(stderr, "Error: invalid device '%s'\n", args.device.c_str());
-            return nullptr;
-        }
-
-        mparams.devices = devs;
-
-        fprintf(stderr, "Using device: %s\n", ggml_backend_dev_name(devs[0]));
-    }
-
-    llama_model_ptr res;
-
-    res.reset(llama_model_load_from_file(args.model.c_str(), mparams));
-
-    if (!res) {
-        fprintf(stderr, "Warning: failed to load model '%s', skipping test\n", args.model.c_str());
-        return nullptr;
-    }
-
-    return res;
-}
-
-struct test_context {
-    llama_context_ptr ctx;
-
-    int n_vocab = 0;
-
-    const llama_vocab * vocab = nullptr;
-
-    std::unordered_map<llama_seq_id, int32_t> seq_positions;
-    std::unordered_map<llama_seq_id, int32_t> last_batch_info;
-
-    test_context(const test_params & params, std::vector<llama_sampler_seq_config> & configs, int32_t n_seq_max = -1) {
-        auto * model = params.model.get();
-
-        GGML_ASSERT(model);
-        GGML_ASSERT(!ctx);
-
-        llama_context_params cparams = llama_context_default_params();
-        cparams.n_ctx = 512;
-        cparams.n_batch = 512;
-        cparams.samplers = configs.data();
-        cparams.n_samplers = configs.size();
-
-        // If n_seq_max is not specified, calculate it from configs
-        if (n_seq_max < 0) {
-            int32_t max_seq_id = 0;
-            for (const auto & config : configs) {
-                max_seq_id = std::max(config.seq_id, max_seq_id);
-            }
-            cparams.n_seq_max = max_seq_id + 1;
-        } else {
-            cparams.n_seq_max = n_seq_max;
-        }
-
-        ctx.reset(llama_init_from_model(model, cparams));
-        if (!ctx) {
-            throw std::runtime_error("failed to create context");
-        }
-
-        llama_set_warmup(ctx.get(), false);
-
-        vocab = llama_model_get_vocab(model);
-        n_vocab = llama_vocab_n_tokens(vocab);
-    }
-
-    bool decode(const std::map<llama_seq_id, std::string> & prompts) {
-        GGML_ASSERT(ctx);
-
-        last_batch_info.clear();
-        llama_batch batch = llama_batch_init(512, 0, prompts.size());
-
-        for (const auto & [seq_id, prompt] : prompts) {
-            std::vector<llama_token> tokens;
-            tokens.push_back(llama_vocab_bos(vocab));
-
-            std::vector<llama_token> prompt_tokens(32);
-            int n_tokens = llama_tokenize(vocab, prompt.c_str(), prompt.length(),
-                                           prompt_tokens.data(), prompt_tokens.size(),
-                                           false, false);
-            if (n_tokens < 0) {
-                fprintf(stderr, "Warning: tokenization failed for seq_id %d\n", seq_id);
-                llama_batch_free(batch);
-                return false;
-            }
-
-            for (int i = 0; i < n_tokens; i++) {
-                tokens.push_back(prompt_tokens[i]);
-            }
-
-            if (seq_positions.find(seq_id) == seq_positions.end()) {
-                seq_positions[seq_id] = 0;
-            }
-
-            int32_t start_pos = seq_positions[seq_id];
-            for (size_t i = 0; i < tokens.size(); i++) {
-                common_batch_add(batch, tokens[i], start_pos + i, { seq_id }, i == tokens.size() - 1);
-            }
-
-            seq_positions[seq_id] = start_pos + tokens.size();
-        }
-
-
-        printf("Batch contents:\n");
-        printf("n_tokens: %d\n", batch.n_tokens);
-        for (int i = 0; i < batch.n_tokens; i++) {
-            printf("token[%d]: tok=%-5d, pos=%d, n_seq_id=%d, seq_ids=[", i, batch.token[i], batch.pos[i], batch.n_seq_id[i]);
-
-            for (int j = 0; j < batch.n_seq_id[i]; j++) {
-                printf("%d%s", batch.seq_id[i][j], j < batch.n_seq_id[i]-1 ? ", " : "");
-            }
-            printf("], logits=%d\n", batch.logits[i]);
-        }
-
-        if (llama_decode(ctx.get(), batch) != 0) {
-            fprintf(stderr, "Warning: llama_decode failed\n");
-            llama_batch_free(batch);
-            return false;
-        }
-
-        // Build mapping from seq id to batch token idx
-        for (int i = 0; i < batch.n_tokens; i++) {
-            if (batch.logits[i]) {
-                llama_seq_id seq_id = batch.seq_id[i][0];
-                last_batch_info[seq_id] = i;
-            }
-        }
-
-        llama_batch_free(batch);
-        return true;
-    }
-
-    int32_t idx_for_seq(llama_seq_id seq_id) {
-        auto it = last_batch_info.find(seq_id);
-        if (it == last_batch_info.end()) {
-            fprintf(stderr, "Error: no batch index found for seq_id %d\n", seq_id);
-            return -1;
-        }
-        return it->second;
-    }
-
-    void update_batch_info(const llama_batch & batch) {
-        last_batch_info.clear();
-        for (int i = 0; i < batch.n_tokens; i++) {
-            if (batch.logits[i]) {
-                llama_seq_id cur_seq = batch.seq_id[i][0];
-                last_batch_info[cur_seq] = i;
-            }
-        }
-    }
-
-    bool decode_token(llama_token token, llama_seq_id seq_id = 0) {
-        GGML_ASSERT(ctx);
-
-        llama_batch batch = llama_batch_init(1, 0, 1);
-        int32_t pos = seq_positions[seq_id];
-        common_batch_add(batch, token, pos, { seq_id }, true);
-
-        if (llama_decode(ctx.get(), batch) != 0) {
-            fprintf(stderr, "Warning: llama_decode failed for token %d in seq %d\n", token, seq_id);
-            llama_batch_free(batch);
-            return false;
-        }
-
-        update_batch_info(batch);
-
-        seq_positions[seq_id]++;
-        llama_batch_free(batch);
-
-        return true;
-    }
-
-    bool decode_tokens(const std::map<llama_seq_id, llama_token> & seq_tokens) {
-        GGML_ASSERT(ctx);
-
-        llama_batch batch = llama_batch_init(seq_tokens.size(), 0, seq_tokens.size());
-
-        for (const auto & [seq_id, token] : seq_tokens) {
-            int32_t pos = seq_positions[seq_id];
-            common_batch_add(batch, token, pos, { seq_id }, true);
-        }
-
-        if (llama_decode(ctx.get(), batch) != 0) {
-            fprintf(stderr, "Warning: llama_decode failed for batch tokens\n");
-            llama_batch_free(batch);
-            return false;
-        }
-
-        for (const auto & [seq_id, _] : seq_tokens) {
-            seq_positions[seq_id]++;
-        }
-
-        update_batch_info(batch);
-
-        llama_batch_free(batch);
-
-        return true;
-    }
-
-    std::string token_to_piece(llama_token token, bool special) const {
-        std::string piece;
-        piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
-        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
-        if (n_chars < 0) {
-            piece.resize(-n_chars);
-            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
-            GGML_ASSERT(check == -n_chars);
-        } else {
-            piece.resize(n_chars);
-        }
-
-        return piece;
-    }
-};
-
-static void test_backend_greedy_sampling(const test_params & params) {
-    const int seq_id = 0;
-
-    struct llama_sampler_chain_params backend_sampler_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_sampler_params));
-
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_greedy());
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Some"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-    llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-    printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-
-    token = llama_get_sampled_token_ith(test_ctx.ctx.get(), -1);
-    printf("greedy sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-
-    for (int i = 0; i < 10; i++) {
-        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
-        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), loop_idx);
-        printf("Generation step %d: token id:%d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
-        if (!test_ctx.decode_token(token, 0)) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-    }
-}
-
-static void test_backend_top_k_sampling(const test_params & params) {
-    const int seq_id = 0;
-    const int32_t k = 8;
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_k(k));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Hello"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-    float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
-    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-    for (size_t i = 0; i < n_logits; ++i) {
-        printf("top_k logit[%zu] = %.6f\n", i, logits[i]);
-    }
-
-    llama_token * candidates = llama_get_sampled_candidates_ith(test_ctx.ctx.get(), batch_idx);
-    uint32_t n_candidates = llama_get_sampled_candidates_count_ith(test_ctx.ctx.get(), batch_idx);
-    for (size_t i = 0; i < n_candidates; ++i) {
-        printf("top_k candidate[%zu] = %d : %s\n", i, candidates[i],
-               test_ctx.token_to_piece(candidates[i], false).c_str());
-    }
-
-    // Sample using CPU sampler for verification that it is possible to do hybrid
-    // sampling, first top_k on the backend and then dist on the CPU.
-    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-    GGML_ASSERT(chain->iface->backend_apply != nullptr);
-
-    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
-    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-
-    printf("backend top-k hybrid sampling test PASSED\n");
-}
-
-static void test_backend_temp_sampling(const test_params & params) {
-    {
-        const float temp_0 = 0.8f;
-        struct llama_sampler_chain_params backend_chain_params_0 = llama_sampler_chain_default_params();
-        llama_sampler_ptr backend_sampler_chain_0(llama_sampler_chain_init(backend_chain_params_0));
-        llama_sampler_chain_add(backend_sampler_chain_0.get(), llama_sampler_init_temp(temp_0));
-
-        const float temp_1 = 0.1f;
-        struct llama_sampler_chain_params backend_chain_params_1 = llama_sampler_chain_default_params();
-        llama_sampler_ptr backend_sampler_chain_1(llama_sampler_chain_init(backend_chain_params_1));
-        llama_sampler_chain_add(backend_sampler_chain_1.get(), llama_sampler_init_temp(temp_1));
-
-        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-            { 0, backend_sampler_chain_0.get() },
-            { 1, backend_sampler_chain_1.get() }
-        };
-
-        test_context test_ctx(params, backend_sampler_configs);
-
-        if (!test_ctx.decode({{0, "Some where over the"}, {1, "Once upon a"}})) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-
-        // Verfify sequence 0
-        {
-            int32_t batch_idx = test_ctx.idx_for_seq(0);
-            int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-            GGML_ASSERT(n_logits == test_ctx.n_vocab);
-
-            // Sample from sequence 0 using CPU sampler
-            struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-            llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-            llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
-
-            llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-            const std::string token_str = test_ctx.token_to_piece(token, false);
-            printf("Sequence 0 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
-            GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-        }
-
-
-        // Verfify sequence 1
-        {
-            int32_t batch_idx = test_ctx.idx_for_seq(1);
-
-            // Sample from sequence 1 using CPU sampler
-            struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-            llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-            llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
-
-            llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-            const std::string token_str = test_ctx.token_to_piece(token, false);
-            printf("Sequence 1 sampled token id:%d, string: '%s'\n", token, token_str.c_str());
-            GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-        }
-    }
-
-    // lambda to testing non-positive temperature values.
-    auto test_argmax_temp = [&](float temp) {
-        printf("\nTesting temperature = %.1f\n", temp);
-
-        int seq_id = 0;
-        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp(temp));
-
-        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-            { seq_id, backend_sampler_chain.get() },
-        };
-
-        test_context test_ctx(params, backend_sampler_configs);
-
-        if (!test_ctx.decode({{seq_id, "Once"}})) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-
-        int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-        uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-        GGML_ASSERT(n_logits == 1);
-    };
-
-    test_argmax_temp(0.0f);
-    test_argmax_temp(-1.0f);
-
-    printf("backend temp sampling test PASSED\n");
-}
-
-static void test_backend_temp_ext_sampling(const test_params & params) {
-    {
-        int seq_id = 0;
-        const float temp = 0.8f;
-        const float delta = 0.5f;
-        const float exponent = 1.5f;
-        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp_ext(temp, delta, exponent));
-
-        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-            { seq_id, backend_sampler_chain.get() },
-        };
-
-        test_context test_ctx(params, backend_sampler_configs);
-
-        if (!test_ctx.decode({{seq_id, "Once upon a"}})) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-
-        // Verify sequence 0
-        {
-            int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-            int n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-            GGML_ASSERT(n_logits == test_ctx.n_vocab);
-        }
-    }
-
-    // lambda to testing non-positive temp/delta/exponent values.
-    auto test_argmax_temp = [&](float temp, float delta, float exponent) {
-        printf("\nTesting temperature = %.1f, delta = %1.f, exponent = %1.f\n", temp, delta, exponent);
-
-        int seq_id = 0;
-        struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-        llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-        llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_temp_ext(temp, delta, exponent));
-
-        std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-            { seq_id, backend_sampler_chain.get() },
-        };
-
-        test_context test_ctx(params, backend_sampler_configs);
-
-        if (!test_ctx.decode({{seq_id, "Once"}})) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-
-        int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-        uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-
-        if (temp <= 0.0f && delta >= 0.0f) {
-            GGML_ASSERT(n_logits == 1);
-        } else {
-            GGML_ASSERT(n_logits == (uint32_t) test_ctx.n_vocab);
-        }
-    };
-
-    test_argmax_temp(0.0f,  0.3f, 1.0f); // Greedy (temp=0)
-    test_argmax_temp(-1.0f, 0.3f, 2.0f); // Greedy (temp<0)
-    test_argmax_temp(0.8f,  0.0f, 2.0f); // Temperature scaling
-
-    printf("backend temp_ext sampling test PASSED\n");
-}
-
-static void test_backend_min_p_sampling(const test_params & params) {
-    const int seq_id = 0;
-    const float p = 0.1;
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_min_p(p, 0));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Hello"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-    float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
-    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-
-    // Print the logits that are above the min-p threshold
-    std::vector<float> filtered_logits;
-    for (size_t i = 0; i < n_logits; ++i) {
-        if (logits[i] > -1e9f) {
-            filtered_logits.push_back(logits[i]);
-            //printf("min_p logit[%zu] = %.6f\n", i, logits[i]);
-        }
-    }
-    GGML_ASSERT(filtered_logits.size() < (size_t) test_ctx.n_vocab);
-
-    // Sample using CPU sampler for verification to inspect they are reasonable
-    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(88));
-
-    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-    const std::string token_str = test_ctx.token_to_piece(token, false);
-    printf("min-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str());
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-
-    // Decode and sampler 10 more tokens
-    for (int i = 0; i < 10; i++) {
-        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
-        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx);
-        printf("min-p gen step %d: token id :%5.d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
-        if (!test_ctx.decode_token(token, 0)) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-    }
-
-    printf("min-p sampling test PASSED\n");
-}
-
-static void test_backend_top_p_sampling(const test_params & params) {
-    const int seq_id = 0;
-    const float p = 0.9;
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_top_p(p, 0));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Hello"}})) {
-        return;
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-    float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
-    uint32_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-
-    // Print the logits that are above the min-p threshold
-    std::vector<float> filtered_logits;
-    for (size_t i = 0; i < n_logits; ++i) {
-        if (logits[i] > -1e9f) {
-            filtered_logits.push_back(logits[i]);
-        }
-    }
-    GGML_ASSERT(filtered_logits.size() < (size_t) test_ctx.n_vocab);
-    GGML_ASSERT(filtered_logits.size() > 0);
-
-    // Sample using CPU sampler for verification to inspect they are reasonable
-    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(88));
-
-    llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-    const std::string token_str = test_ctx.token_to_piece(token, false);
-    printf("top-p cpu sampled token id:%d, string: '%s'\n", token, token_str.c_str());
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-
-    // Decode and sampler 10 more tokens
-    for (int i = 0; i < 10; i++) {
-        int32_t loop_idx = test_ctx.idx_for_seq(seq_id);
-        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), loop_idx);
-        printf("top-p gen step %d: token id :%5.d, string: %s\n", i, token, test_ctx.token_to_piece(token, false).c_str());
-        test_ctx.decode_token(token, 0);
-    }
-
-    printf("top-p sampling test PASSED\n");
-}
-
-static void test_backend_multi_sequence_sampling(const test_params & params) {
-    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
-    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
-    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_greedy());
-
-    struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
-    llama_sampler_ptr sampler_chain_1(llama_sampler_chain_init(chain_params_1));
-    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_temp(0.8f));
-    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_greedy());
-
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-        { 0, sampler_chain_0.get() },
-        { 1, sampler_chain_1.get() }
-    };
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    std::map<llama_seq_id, std::string> prompts = {
-        {0, "Hello"},
-        {1, "Some"}
-    };
-
-    if (!test_ctx.decode(prompts)) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    // Verfiy sequence 0
-    {
-        int32_t batch_idx = test_ctx.idx_for_seq(0);
-        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-        const std::string token_str = test_ctx.token_to_piece(token, false);
-        printf("Seq 0 sampled token id=%d, string='%s'\n", token, token_str.c_str());
-        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-    }
-
-    // Verify sequence 1
-    {
-        int32_t batch_idx= test_ctx.idx_for_seq(1);
-        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-        const std::string token_str = test_ctx.token_to_piece(token, false);
-        printf("Seq 1 sampled token id=%d, string='%s'\n", token, token_str.c_str());
-        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-    }
-
-    // Generate tokens for each sequence
-    printf("\nMulti-sequence generation:\n");
-    for (int step = 0; step < 4; step++) {
-        std::map<llama_seq_id, llama_token> tokens;
-
-        for (llama_seq_id seq_id : {0, 1}) {
-            int32_t idx = test_ctx.idx_for_seq(seq_id);
-            llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), idx);
-            const std::string token_str = test_ctx.token_to_piece(token, false);
-            printf("  Seq %d, step %d: token id=%d, string='%s'\n", seq_id, step, token, token_str.c_str());
-            tokens[seq_id] = token;
-        }
-
-        // Decode all tokens in a single batch
-        if (!test_ctx.decode_tokens(tokens)) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-    }
-
-    printf("backend multi-sequence sampling test PASSED\n");
-}
-
-static void test_backend_dist_sampling(const test_params & params) {
-    const int seq_id = 189;
-    const int32_t seed = 88;
-
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Some"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-    llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-    printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-    //GGML_ASSERT(llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx) == nullptr);
-
-    token = llama_get_sampled_token_ith(test_ctx.ctx.get(), -1);
-    printf("dist sampled id:%d, string:'%s'\n", token, test_ctx.token_to_piece(token, false).c_str());
-    GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-
-    printf("backend dist sampling test PASSED\n");
-}
-
-static void test_backend_dist_sampling_and_cpu(const test_params & params) {
-    const int seq_id = 0;
-    const int32_t seed = 88;
-
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Some"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-    // Sample using CPU sampler
-    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
-
-    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-    llama_token cpu_token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-    printf("dist & cpu sampled id:%d, string:'%s'\n", cpu_token, test_ctx.token_to_piece(cpu_token, false).c_str());
-    GGML_ASSERT(backend_token == cpu_token);
-
-    printf("backend dist & cpu sampling test PASSED\n");
-}
-
-static void test_backend_logit_bias_sampling(const test_params & params) {
-    const auto * model = params.model.get();
-    const auto * vocab = llama_model_get_vocab(model);
-
-    const int seq_id = 0;
-
-    std::vector<llama_logit_bias> logit_bias;
-
-    // Get the token for the piece "World".
-    const std::string piece = "World";
-    std::vector<llama_token> tokens(16);
-    llama_tokenize(vocab, piece.c_str(), piece.size(), tokens.data(), tokens.size(), false, false);
-
-    llama_token bias_token = tokens[0];
-    // TODO: biasing too much here makes the Vulkan sampling fail - should be investigated further
-    //       https://github.com/ggml-org/llama.cpp/actions/runs/20894267644/job/60030252675?pr=18753#step:3:23350
-    //logit_bias.push_back({ bias_token, +100.0f });
-    logit_bias.push_back({ bias_token, +10.0f });
-
-    printf("biasing token piece '%s' -> token id %d\n", piece.c_str(), bias_token);
-
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                logit_bias.size(),
-                logit_bias.data()));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(88));
-
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-        { seq_id, backend_sampler_chain.get() },
-    };
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Hello"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
-    printf("sampled token = %d, expected = %d\n", backend_token, bias_token);
-    GGML_ASSERT(backend_token == bias_token);
-
-    printf("backend logit bias sampling test PASSED\n");
-}
-
-// This test verifies that it is possible to have two different backend sampler,
-// one that uses the backend dist sampler, and another that uses CPU dist sampler.
-static void test_backend_mixed_sampling(const test_params & params) {
-    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
-    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
-    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));
-
-    int k = 40;
-    struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
-    llama_sampler_ptr sampler_chain_1(llama_sampler_chain_init(chain_params_1));
-    llama_sampler_chain_add(sampler_chain_1.get(), llama_sampler_init_top_k(k));
-
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-        { 0, sampler_chain_0.get() },
-        { 1, sampler_chain_1.get() }
-    };
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    std::map<llama_seq_id, std::string> prompts = {
-        {0, "Hello"},
-        {1, "Some"}
-    };
-
-    if (!test_ctx.decode(prompts)) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    // Verfiy sequence 0 that used the dist backend sampler.
-    {
-        int32_t batch_idx = test_ctx.idx_for_seq(0);
-        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-        const std::string token_str = test_ctx.token_to_piece(token, false);
-        printf("sampled token id=%d, string='%s'\n", token, token_str.c_str());
-        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-        //GGML_ASSERT(llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx) == nullptr);
-        //GGML_ASSERT(llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx) == 0);
-    }
-
-    // Verfiy sequence 1 that used the top-k backend sampler.
-    {
-        int32_t batch_idx = test_ctx.idx_for_seq(1);
-        float * logits = llama_get_sampled_logits_ith(test_ctx.ctx.get(), batch_idx);
-        GGML_ASSERT(logits != nullptr);
-        size_t n_logits = llama_get_sampled_logits_count_ith(test_ctx.ctx.get(), batch_idx);
-        GGML_ASSERT(n_logits == (size_t) k);
-        GGML_ASSERT(llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx) == LLAMA_TOKEN_NULL);
-    }
-
-    printf("backend mixed sampling test PASSED\n");
-}
-
-static void test_backend_set_sampler(const test_params & params) {
-    const int seq_id = 0;
-    const int32_t seed = 88;
-
-    struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    if (!test_ctx.decode({{seq_id, "Hello"}})) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    int32_t batch_idx = test_ctx.idx_for_seq(seq_id);
-
-    // Sample using backend sampler configured above
-    llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-    const std::string backend_token_str = test_ctx.token_to_piece(backend_token, false);
-    printf("dist sampled token = %d, string='%s'\n", backend_token, backend_token_str.c_str());
-
-    // Now clear the backend sampler for this sequence.
-    llama_set_sampler(test_ctx.ctx.get(), seq_id, nullptr);
-    printf("Cleared backend sampler for seq_id %d\n", seq_id);
-
-    // Sample using CPU sampler
-    struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-    llama_sampler_chain_add(chain.get(), llama_sampler_init_dist(18));
-
-    std::map<llama_seq_id, llama_token> tokens = { { seq_id, backend_token}, };
-    if (!test_ctx.decode_tokens(tokens)) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    // Should not have any sampled token or probs after clearing the backend sampler.
-    const int32_t idx = test_ctx.idx_for_seq(seq_id);
-    GGML_ASSERT(llama_get_sampled_token_ith(test_ctx.ctx.get(), idx) == LLAMA_TOKEN_NULL);
-    GGML_ASSERT(llama_get_sampled_probs_ith(test_ctx.ctx.get(), idx) == nullptr);
-
-    // Sample the token using the CPU sampler chain.
-    llama_token token2 = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), seq_id);
-    const std::string token2_str = test_ctx.token_to_piece(token2, false);
-    printf("CPU sampled token after clearing backend sampler: id=%d, string='%s'\n", token2, token2_str.c_str());
-    std::map<llama_seq_id, llama_token> tokens2 = { { seq_id, token2}, };
-
-    // Set a new backend sampler for the sequence.
-    struct llama_sampler_chain_params new_backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr new_backend_sampler_chain(llama_sampler_chain_init(new_backend_chain_params));
-    llama_sampler_chain_add(new_backend_sampler_chain.get(), llama_sampler_init_top_k(20));
-    llama_sampler_chain_add(new_backend_sampler_chain.get(), llama_sampler_init_dist(seed));
-    llama_set_sampler(test_ctx.ctx.get(), seq_id, new_backend_sampler_chain.get());
-
-    if (!test_ctx.decode_tokens(tokens2)) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    llama_token new_backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), test_ctx.idx_for_seq(seq_id));
-    const std::string new_backend_token_str = test_ctx.token_to_piece(new_backend_token, false);
-    printf("dist sampled token = %d, string='%s'\n", new_backend_token, new_backend_token_str.c_str());
-
-    printf("backend set sampler test PASSED\n");
-}
-
-static void test_backend_cpu_mixed_batch(const test_params & params) {
-    // Sequence 0 uses backend sampling
-    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
-    llama_sampler_ptr sampler_chain_0(llama_sampler_chain_init(chain_params_0));
-    llama_sampler_chain_add(sampler_chain_0.get(), llama_sampler_init_dist(88));
-
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
-        { 0, sampler_chain_0.get() },
-    };
-
-    // We need 2 sequences: seq 0 with backend sampling, seq 1 with CPU sampling
-    test_context test_ctx(params, backend_sampler_configs, 2);
-
-    std::map<llama_seq_id, std::string> prompts = {
-        {0, "Hello"}, // Will use backend sampling
-        {1, "Some"}   // Will use CPU sampling
-    };
-
-    if (!test_ctx.decode(prompts)) {
-        GGML_ASSERT(false && "Failed to decode token");
-    }
-
-    // Verify sequence 0 (backend sampled)
-    {
-        int32_t batch_idx = test_ctx.idx_for_seq(0);
-        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-        const std::string token_str = test_ctx.token_to_piece(token, false);
-        printf("Seq 0 (backend) sampled token id=%d, string='%s'\n", token, token_str.c_str());
-        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-    }
-
-    // Verify sequence 1 (CPU sampled)
-    {
-        int32_t batch_idx = test_ctx.idx_for_seq(1);
-
-        llama_token backend_token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-        GGML_ASSERT(backend_token == LLAMA_TOKEN_NULL);
-
-        struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-        llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-        llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy());
-
-        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-        const std::string token_str = test_ctx.token_to_piece(token, false);
-        printf("Seq 1 (CPU) sampled token id=%d, string='%s'\n", token, token_str.c_str());
-        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-    }
-
-    // Clear/remove the backend sampler, and sample again
-    {
-        // clear the backend sampler for seq 0 so that there are no backend
-        // samplers.
-        llama_set_sampler(test_ctx.ctx.get(), 0, nullptr);
-
-        // Create a CPU sampler and verify we can sampler from it.
-        struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-        llama_sampler_ptr chain(llama_sampler_chain_init(chain_params));
-        llama_sampler_chain_add(chain.get(), llama_sampler_init_greedy());
-
-        int32_t batch_idx = test_ctx.idx_for_seq(1);
-        llama_token token = llama_sampler_sample(chain.get(), test_ctx.ctx.get(), batch_idx);
-        if (!test_ctx.decode_token(token, 1)) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-    }
-
-    // Set a backend sampler so that we can verify that it can be reset
-    {
-        struct llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
-        llama_sampler_ptr sampler_chain(llama_sampler_chain_init(chain_params));
-        llama_sampler_chain_add(sampler_chain.get(), llama_sampler_init_dist(88));
-
-        llama_set_sampler(test_ctx.ctx.get(), 0, sampler_chain.get());
-
-        if (!test_ctx.decode_token(3834, 0)) {
-            GGML_ASSERT(false && "Failed to decode token");
-        }
-
-        int32_t batch_idx = test_ctx.idx_for_seq(0);
-        llama_token token = llama_get_sampled_token_ith(test_ctx.ctx.get(), batch_idx);
-        const std::string token_str = test_ctx.token_to_piece(token, false);
-        printf("re-added backend sampled token id=%d, string='%s'\n", token, token_str.c_str());
-        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
-    }
-
-    printf("backend-cpu mixed batch test PASSED\n");
-}
-
-static void test_backend_max_outputs(const test_params & params) {
-    const int seq_id = 0;
-    const int32_t seed = 88;
-
-    llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
-    llama_sampler_ptr backend_sampler_chain(llama_sampler_chain_init(backend_chain_params));
-    llama_sampler_chain_add(backend_sampler_chain.get(), llama_sampler_init_dist(seed));
-    std::vector<llama_sampler_seq_config> backend_sampler_configs = {{ seq_id, backend_sampler_chain.get() }};
-
-    test_context test_ctx(params, backend_sampler_configs);
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-    std::string prompt = "Hello";
-
-    std::vector<llama_token> tokens;
-    tokens.push_back(llama_vocab_bos(test_ctx.vocab));
-
-    std::vector<llama_token> prompt_tokens(32);
-    int n_tokens = llama_tokenize(test_ctx.vocab, prompt.c_str(), prompt.length(),
-                                   prompt_tokens.data(), prompt_tokens.size(),
-                                   false, false);
-    for (int i = 0; i < n_tokens; i++) {
-        tokens.push_back(prompt_tokens[i]);
-    }
-
-    for (size_t i = 0; i < tokens.size(); i++) {
-        // set all tokens as output to trigger error
-        common_batch_add(batch, tokens[i], i, { seq_id }, true);
-    }
-
-    printf(">>> test_max_outputs expected error start:\n");
-    const int ret = llama_decode(test_ctx.ctx.get(), batch);
-    GGML_ASSERT(ret != 0 && "llama_decode should not succeed multiple outputs per sequence");
-    printf("<<< test_max_outputs expected error end.\n");
-    llama_batch_free(batch);
-
-    printf("backend max outputs test PASSED\n");
-}
-
-struct backend_test_case {
-    std::string name;
-    void (*fn)(const test_params &);
-    bool enabled_by_default;
-};
-
-static const backend_test_case BACKEND_TESTS[] = {
-    { "greedy",          test_backend_greedy_sampling,         true  },
-    { "logit_bias",      test_backend_logit_bias_sampling,     true  },
-    { "temp",            test_backend_temp_sampling,           true  },
-    { "temp_ext",        test_backend_temp_ext_sampling,       true  },
-    { "top_k",           test_backend_top_k_sampling,          true  },
-    { "multi_sequence",  test_backend_multi_sequence_sampling, true  },
-    { "dist",            test_backend_dist_sampling,           true  },
-    { "dist_and_cpu",    test_backend_dist_sampling_and_cpu,   true  },
-    { "set_sampler",     test_backend_set_sampler,             true  },
-    { "max_outputs",     test_backend_max_outputs,             true  },
-    { "mixed",           test_backend_mixed_sampling,          true  },
-    { "min_p",           test_backend_min_p_sampling,          true  },
-    { "cpu_mixed",       test_backend_cpu_mixed_batch,         true  },
-    { "top_p",           test_backend_top_p_sampling,          true  },
-};
-
-static test_args parse_cli(int argc, char ** argv) {
-    test_args out;
-
-    for (int i = 1; i < argc; ++i) {
-        const char * arg = argv[i];
-
-        if (std::strcmp(arg, "--test") == 0) {
-            if (i + 1 >= argc) {
-                fprintf(stderr, "--test expects a value\n");
-                exit(EXIT_FAILURE);
-            }
-            out.test = argv[++i];
-            continue;
-        }
-        if (std::strncmp(arg, "--test=", 7) == 0) {
-            out.test = arg + 7;
-            continue;
-        }
-        if (std::strcmp(arg, "--model") == 0) {
-            if (i + 1 >= argc) {
-                fprintf(stderr, "--model expects a value\n");
-                exit(EXIT_FAILURE);
-            }
-            out.model = argv[++i];
-            continue;
-        }
-        if (std::strncmp(arg, "--model=", 8) == 0) {
-            out.model = arg + 8;
-            continue;
-        }
-        if (std::strcmp(arg, "--device") == 0) {
-            if (i + 1 >= argc) {
-                fprintf(stderr, "--device expects a value (cpu or gpu)\n");
-                exit(EXIT_FAILURE);
-            }
-            out.device = argv[++i];
-            continue;
-        }
-        if (std::strncmp(arg, "--device=", 9) == 0) {
-            out.device = arg + 9;
-            continue;
-        }
-        if (out.model.empty()) {
-            out.model = arg;
-            continue;
-        }
-
-        fprintf(stderr, "Unexpected argument: %s\n", arg);
-        exit(EXIT_FAILURE);
-    }
-
-    if (out.device != "cpu" && out.device != "gpu" && out.device != "auto") {
-        fprintf(stderr, "Invalid device '%s'. Must be 'cpu', 'gpu' or 'auto'\n", out.device.c_str());
-        exit(EXIT_FAILURE);
-    }
-
-    return out;
-}
-
-static std::vector<const backend_test_case *> collect_tests_to_run(const std::string & requested) {
-    std::vector<const backend_test_case *> selected;
-
-    if (!requested.empty()) {
-        for (const auto & test : BACKEND_TESTS) {
-            if (test.name == requested) {
-                selected.push_back(&test);
-                break;
-            }
-        }
-        if (selected.empty()) {
-            fprintf(stderr, "Unknown test '%s'. Available tests:\n", requested.c_str());
-            for (const auto & test : BACKEND_TESTS) {
-                fprintf(stderr, "  %s\n", test.name.c_str());
-            }
-            exit(EXIT_FAILURE);
-        }
-    } else {
-        for (const auto & test : BACKEND_TESTS) {
-            if (test.enabled_by_default) {
-                selected.push_back(&test);
-            }
-        }
-    }
-
-    if (selected.empty()) {
-        fprintf(stderr, "No backend sampling tests selected. Use --test=<name> to pick one.\n");
-    }
-
-    return selected;
-}
-
-static void run_tests(const std::vector<const backend_test_case *> & tests, const test_params & args) {
-    for (const auto & test : tests) {
-        fprintf(stderr, "\n=== %s ===\n", test->name.c_str());
-        try {
-            test->fn(args);
-        } catch (const std::exception & e) {
-            fprintf(stderr, "Error running test '%s': %s\n", test->name.c_str(), e.what());
-            exit(EXIT_FAILURE);
-        }
-    }
-}
-
-int main(int argc, char ** argv) {
-    test_args args = parse_cli(argc, argv);
-
-    if (args.model.empty()) {
-        args.model = get_model_or_exit(1, argv);
-    }
-
-    {
-        std::ifstream file(args.model);
-        if (!file.is_open()) {
-            fprintf(stderr, "no model '%s' found\n", args.model.c_str());
-            return EXIT_FAILURE;
-        }
-    }
-
-    fprintf(stderr, "using '%s'\n", args.model.c_str());
-
-    llama_backend_init();
-
-    test_params params = {
-        /*.model =*/ load_model(args),
-    };
-
-    const std::vector<const backend_test_case *> tests = collect_tests_to_run(args.test);
-    if (!tests.empty()) {
-        run_tests(tests, params);
-    }
-
-    return 0;
-}