Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	Makefile
#	README.md
#	common/common.h
Author: Concedo
Date:   2024-03-17 23:03:12 +08:00
Commit: 8b360b661c
24 changed files with 1696 additions and 579 deletions

.github/workflows/close-issue.yml (new file)

@ -0,0 +1,22 @@
name: Close inactive issues
on:
schedule:
- cron: "42 0 * * *"
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}

File diff suppressed because it is too large.


@ -31,10 +31,15 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
// build info
struct llama_control_vector_load_info;
int32_t get_num_physical_cores();
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@ -117,6 +122,11 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
@ -283,3 +293,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
void llama_embd_normalize(const float * inp, float * out, int n);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
//
// Control vector utils
//
struct llama_control_vector_data {
int n_embd;
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
std::vector<float> data;
};
struct llama_control_vector_load_info {
float strength;
std::string fname;
};
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
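Since `llama_control_vector_load` is only declared in this hunk, a minimal, hedged usage sketch may help illustrate the intended call pattern. The `.gguf` file names and the `main` wrapper are hypothetical; only the declarations shown above are assumed.

```cpp
// Hedged sketch: load two control vectors, scale each by its strength, and sum them.
// The file names are hypothetical; only the declarations in this hunk are assumed.
#include "common.h"
#include <cstdio>
#include <vector>

int main() {
    std::vector<llama_control_vector_load_info> load_infos = {
        { /* strength */  0.8f, /* fname */ "direction-a.gguf" },  // hypothetical path
        { /* strength */ -0.4f, /* fname */ "direction-b.gguf" },  // negative strength subtracts this direction
    };

    llama_control_vector_data cvec = llama_control_vector_load(load_infos);
    if (cvec.n_embd == -1) {  // on error the loader returns {-1, empty}
        fprintf(stderr, "failed to load control vectors\n");
        return 1;
    }

    // data covers layers [1, n_layer] where n_layer = data.size() / n_embd
    const int n_layer = (int) cvec.data.size() / cvec.n_embd;
    printf("loaded control vector: n_embd = %d, n_layer = %d\n", cvec.n_embd, n_layer);
    return 0;
}
```

Presumably the `control_vectors` and `control_vector_layer_start`/`end` fields added to `gpt_params` above feed this loader, though that wiring is outside this hunk.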


@ -1965,6 +1965,23 @@ class MambaModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("CohereForCausalLM")
class CommandR2Model(Model):
model_arch = gguf.MODEL_ARCH.COMMAND_R
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# max_position_embeddings = 8192 in config.json but model was actually
# trained on 128k context length
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
###### CONVERSION LOGIC ######

examples/gritlm/README.md (new file)

@ -0,0 +1,62 @@
## Generative Representational Instruction Tuning (GRIT) Example
[gritlm] is a model that can generate embeddings as well as "normal" text,
depending on the instructions in the prompt.
* Paper: https://arxiv.org/pdf/2402.09906.pdf
### Retrieval-Augmented Generation (RAG) use case
One use case for `gritlm` is to use it with RAG. Recall how RAG works: we take
the documents we want to use as context to ground the large language model
(LLM), create token embeddings for them, and store these embeddings in a
vector database.
When we perform a query (prompt the LLM), we first create token embeddings for
the query, search the vector database for the most similar vectors, and return
those documents so they can be passed to the LLM as context. The query and the
context are then passed to the LLM, which has to _again_ create token
embeddings for the query. But because gritlm handles both tasks, the
embeddings computed in the first step can be cached, and the second embedding
pass over the query does not have to be performed at all.
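As a rough illustration of the retrieval step described above, the sketch below ranks stored document embeddings against a query embedding using the `llama_embd_similarity_cos` helper that `common.h` exposes in this same change; the containers and sizes are hypothetical.

```cpp
// Hedged sketch of the retrieval step: pick the stored document whose embedding
// is most similar to the query embedding. The containers are hypothetical;
// llama_embd_similarity_cos is declared in common.h in this change.
#include "common.h"
#include <vector>

int most_similar_doc(const std::vector<std::vector<float>> & doc_embds, // one embedding per document
                     const std::vector<float> & query_embd,             // embedding of the user query
                     int n_embd) {
    int   best_idx = -1;
    float best_sim = -1.0f;
    for (size_t i = 0; i < doc_embds.size(); ++i) {
        const float sim = llama_embd_similarity_cos(doc_embds[i].data(), query_embd.data(), n_embd);
        if (sim > best_sim) {
            best_sim = sim;
            best_idx = (int) i;
        }
    }
    return best_idx; // index of the document to pass to the LLM as context
}
```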
### Running the example
Download a Grit model:
```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
```
Run the example using the downloaded model:
```console
$ ./gritlm -m gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
When shadows lurk and ghosts do roam,
And darkness reigns, a fearsome sight.
Thou didst set out, with heart aglow,
To conquer this mountain, so high,
And reach the summit, where the stars do glow,
And the moon shines bright, up in the sky.
Through the mist and fog, thou didst press on,
With steadfast courage, and a steadfast will,
Through the darkness, thou didst not be gone,
But didst climb on, with a steadfast skill.
At last, thou didst reach the summit's crest,
And gazed upon the world below,
And saw the beauty of the night's best,
And felt the peace, that only nature knows.
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
Thou art a hero, in the eyes of all,
For thou didst conquer this mountain, so bright.
```
[gritlm]: https://github.com/ContextualAI/gritlm


@ -8,6 +8,7 @@
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cstdlib>
#include <iterator>
#include <map>
#include <numeric>
@ -1124,15 +1125,19 @@ struct sql_printer : public printer {
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
//std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
//llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
//GGML_UNUSED(n_batch);
const llama_model * model = llama_get_model(ctx);
const int32_t n_vocab = llama_n_vocab(model);
std::vector<llama_token> tokens(n_batch);
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
int n_processed = 0;
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 1; i < n_tokens; i++) {
tokens[i] = std::rand() % n_vocab;
}
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
n_processed += n_tokens;
}
@ -1143,11 +1148,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
llama_token token = llama_token_bos(llama_get_model(ctx));
const llama_model * model = llama_get_model(ctx);
const int32_t n_vocab = llama_n_vocab(model);
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 0; i < n_gen; i++) {
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
llama_synchronize(ctx);
token = std::rand() % n_vocab;
}
}


@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() {
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
if (batch.size > 0) {
delete[] batch.data;
batch.size = 0;
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
if (batch.size > 0) {
delete[] batch.data;
batch.size = 0;
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}
@ -1497,7 +1497,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
pad_to_square = false;
}
// free the previous res_imgs if any set
if (res_imgs.size > 0) {
if (res_imgs->size > 0) {
clip_image_f32_batch_free(res_imgs);
}
res_imgs.data = nullptr;
res_imgs.size = 0;
res_imgs->data = nullptr;
res_imgs->size = 0;
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
patches.insert(patches.begin(), image_original_resize);
// clip_image_f32_batch_init(patches.size());
res_imgs.size = patches.size();
res_imgs.data = new clip_image_f32[res_imgs.size];
res_imgs->size = patches.size();
res_imgs->data = new clip_image_f32[res_imgs->size];
int num=0;
for (auto& patch : patches) {
normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
num++;
}
@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
// }
// res_imgs.push_back(res);
res_imgs.size = 1;
res_imgs.data = new clip_image_f32[res_imgs.size];
res_imgs.data[0] = *res;
res_imgs->size = 1;
res_imgs->data = new clip_image_f32[res_imgs->size];
res_imgs->data[0] = *res;
clip_image_f32_free(res);
return true;


@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
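A hedged sketch of the new pointer-based batch API declared above, mirroring the call pattern used by `llava.cpp` in this same change set; the wrapper function itself is hypothetical.

```cpp
// Hedged sketch: preprocess into a stack-allocated batch, then free it through
// the new pointer overloads. Error handling mirrors llava.cpp from this change.
#include "clip.h"

bool preprocess_one(struct clip_ctx * ctx_clip, const struct clip_image_u8 * img) {
    clip_image_f32_batch batch;
    batch.size = 0;
    batch.data = nullptr;

    if (!clip_image_preprocess(ctx_clip, img, &batch)) {
        delete[] batch.data;                // same cleanup llava.cpp performs on failure
        return false;
    }

    // ... consume batch.data[0 .. batch.size - 1] here ...

    clip_image_f32_batch_free(&batch);      // deletes data[] and resets size when size > 0
    return true;
}
```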


@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;


@ -29,9 +29,9 @@ struct llava_image_embed {
};
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);


@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#build example/main
#cmake --build . --config Release --target main
#build example/llama-bench
#cmake --build . --config Release --target llama-bench
#build all binary
cmake --build . --config Release -v


@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
GGML_SYCL_SINGLE_GPU=1
else
GGML_SYCL_DEVICE=0
fi
echo "use $GGML_SYCL_DEVICE as main GPU"
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1 enables querying free GPU memory via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode = layer.
#use all GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
echo "use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
fi
#use main GPU only
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
#use multiple GPUs with same max compute units
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0


@ -11539,6 +11539,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
if (ggml_backend_is_cuda(event->backend)) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
} else {
#if 0
// untested
auto wait_fn = [](void * user_data) {
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
@ -11546,6 +11547,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
};
CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
#endif
GGML_ASSERT(false);
}
}


@ -16,6 +16,7 @@
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <float.h>
#include <limits>
#include <stdint.h>
@ -24,10 +25,9 @@
#include <cmath>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <regex>
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
#define __dpct_noinline__ __attribute__((noinline))
#endif
std::string get_device_type_name(const sycl::device &Device) {
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
switch (DeviceType) {
case sycl::info::device_type::cpu:
return "cpu";
case sycl::info::device_type::gpu:
return "gpu";
case sycl::info::device_type::host:
return "host";
case sycl::info::device_type::accelerator:
return "acc";
default:
return "unknown";
}
}
std::string get_device_backend_and_type(const sycl::device &device) {
std::stringstream device_type;
sycl::backend backend = device.get_backend();
device_type << backend << ":" << get_device_type_name(device);
return device_type.str();
}
namespace dpct
{
typedef sycl::queue *queue_ptr;
@ -942,17 +966,65 @@ namespace dpct
private:
mutable std::recursive_mutex m_mutex;
static bool compare_dev(sycl::device &device1, sycl::device &device2)
{
dpct::device_info prop1;
dpct::get_device_info(prop1, device1);
dpct::device_info prop2;
dpct::get_device_info(prop2, device2);
return prop1.get_max_compute_units() > prop2.get_max_compute_units();
}
static int convert_backend_index(std::string & backend) {
if (backend == "ext_oneapi_level_zero:gpu") return 0;
if (backend == "opencl:gpu") return 1;
if (backend == "opencl:cpu") return 2;
if (backend == "opencl:acc") return 3;
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
GGML_ASSERT(false);
}
static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2);
}
dev_mgr()
{
sycl::device default_device =
sycl::device(sycl::default_selector_v);
_devs.push_back(std::make_shared<device_ext>(default_device));
std::vector<sycl::device> sycl_all_devs =
sycl::device::get_devices(sycl::info::device_type::all);
std::vector<sycl::device> sycl_all_devs;
// Collect other devices except for the default device.
if (default_device.is_cpu())
_cpu_device = 0;
auto Platforms = sycl::platform::get_platforms();
// Keep track of the number of devices per backend
std::map<sycl::backend, size_t> DeviceNums;
std::map<std::string, std::vector<sycl::device>> backend_devices;
while (!Platforms.empty()) {
auto Platform = Platforms.back();
Platforms.pop_back();
auto devices = Platform.get_devices();
std::string backend_type = get_device_backend_and_type(devices[0]);
for (const auto &device : devices) {
backend_devices[backend_type].push_back(device);
}
}
std::vector<std::string> keys;
for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
keys.push_back(it->first);
}
std::sort(keys.begin(), keys.end(), compare_backend);
for (auto &key : keys) {
std::vector<sycl::device> devs = backend_devices[key];
std::sort(devs.begin(), devs.end(), compare_dev);
for (const auto &dev : devs) {
sycl_all_devs.push_back(dev);
}
}
for (auto &dev : sycl_all_devs)
{
if (dev == default_device)
@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
#define GGML_SYCL_MMV_Y 1
#endif
enum ggml_sycl_backend_gpu_mode {
SYCL_UNSET_GPU_MODE = -1,
SYCL_SINGLE_GPU_MODE = 0,
SYCL_MUL_GPU_MODE
};
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
int work_group_size = 0;
std::string gpus_list = "";
/*
Use all GPUs with same top max compute units
*/
sycl_gpu_mgr() {
detect_sycl_gpu_list_with_max_cu();
get_allow_gpus();
create_context_with_gpus();
}
/*
Only use the assigned GPU
*/
sycl_gpu_mgr(int main_gpu_id) {
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
dpct::device_info prop;
dpct::get_device_info(prop, device);
gpus.push_back(main_gpu_id);
devices.push_back(device);
work_group_size = prop.get_max_work_group_size();
max_compute_units = prop.get_max_compute_units();
get_allow_gpus();
create_context_with_gpus();
}
void create_context_with_gpus() {
sycl::context ctx = sycl::context(devices);
assert(gpus.size() > 0);
@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
gpus_list += std::to_string(gpus[i]);
gpus_list += ",";
}
if (gpus_list.length() > 2) {
if (gpus_list.length() > 1) {
gpus_list.pop_back();
}
}
@ -3451,7 +3547,7 @@ class sycl_gpu_mgr {
dpct::device_info prop;
dpct::get_device_info(prop, device);
if (max_compute_units == prop.get_max_compute_units() &&
prop.get_major_version() == 1) {
is_ext_oneapi_device(device)) {
gpus.push_back(id);
devices.push_back(device);
work_group_size = prop.get_max_work_group_size();
@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
if (gpus[i] == id)
return i;
}
assert(false);
return -1;
printf("miss to get device index by id=%d\n", id);
GGML_ASSERT(false);
}
int get_next_index(int id) {
@ -3481,8 +3577,16 @@ class sycl_gpu_mgr {
if (gpus[i] == id)
return i;
}
assert(false);
return -1;
GGML_ASSERT(false);
}
bool is_ext_oneapi_device(const sycl::device &dev) {
sycl::backend dev_backend = dev.get_backend();
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
dev_backend == sycl::backend::ext_oneapi_cuda ||
dev_backend == sycl::backend::ext_oneapi_hip)
return true;
return false;
}
};
@ -3491,11 +3595,14 @@ static int g_device_count = -1;
static int g_all_sycl_device_count = -1;
static int g_main_device = -1;
static int g_main_device_id = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
struct sycl_device_capabilities {
int cc; // compute capability
bool vmm; // virtual memory support
@ -12999,17 +13106,20 @@ bool ggml_sycl_loaded(void) {
return g_sycl_loaded;
}
void print_device_detail(int id) {
void print_device_detail(int id, sycl::device &device, std::string device_type) {
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
dpct::get_device_info(prop, device)));
std::string version;
version += std::to_string(prop.get_major_version());
version += ".";
version += std::to_string(prop.get_minor_version());
fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
prop.get_name(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
prop.get_global_mem_size());
@ -13017,16 +13127,32 @@ void print_device_detail(int id) {
void ggml_backend_sycl_print_sycl_devices() {
int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count);
fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
for (int id = 0; id < device_count; ++id) {
print_device_detail(id);
sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend();
std::string backend_type = get_device_backend_and_type(device);
int type_id=DeviceNums[backend_type]++;
std::stringstream device_type;
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
print_device_detail(id, device, device_type.str());
}
}
void print_gpu_device_list() {
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
GGML_ASSERT(g_sycl_gpu_mgr);
char* hint=NULL;
if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
} else {
hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
}
fprintf(stderr, hint,
g_sycl_gpu_mgr->get_gpu_count(),
g_sycl_gpu_mgr->gpus_list.c_str(),
g_sycl_gpu_mgr->max_compute_units);
@ -13065,23 +13191,6 @@ void ggml_init_sycl() try {
#else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
g_sycl_loaded = false;
return;
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
print_gpu_device_list();
int64_t total_vram = 0;
/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
@ -13090,6 +13199,33 @@ void ggml_init_sycl() try {
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
g_sycl_loaded = false;
return;
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
initialized = true;
g_sycl_loaded = true;
}
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}
void ggml_init_by_gpus(int device_count) try {
g_device_count = device_count;
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
int64_t total_vram = 0;
print_gpu_device_list();
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0;
g_device_caps[id].device_id = -1;
@ -13132,10 +13268,6 @@ void ggml_init_sycl() try {
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
}
initialized = true;
g_sycl_loaded = true;
}
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -16542,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .is_host = */ nullptr,
};
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
if (device_index>=g_device_count or device_index<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device_index, g_device_count-1);
GGML_ASSERT(device_index<g_device_count);
}
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
static bool ggml_backend_sycl_buffer_type_initialized = false;
if (!ggml_backend_sycl_buffer_type_initialized) {
if (!g_ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < g_device_count; i++) {
ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
};
}
ggml_backend_sycl_buffer_type_initialized = true;
g_ggml_backend_sycl_buffer_type_initialized = true;
}
return &ggml_backend_sycl_buffer_types[device];
return &ggml_backend_sycl_buffer_types[device_index];
}
// sycl split buffer type
@ -17310,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
return g_sycl_gpu_mgr->get_index(device_id);
}
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
return g_sycl_gpu_mgr->gpus[device_index];
}
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
g_ggml_backend_sycl_buffer_type_initialized = false;
}
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
return;
}
fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
g_sycl_gpu_mgr = new sycl_gpu_mgr();
g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
g_ggml_backend_sycl_buffer_type_initialized = false;
}
extern "C" int ggml_backend_sycl_reg_devices();
int ggml_backend_sycl_reg_devices() {
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
ggml_backend_sycl_set_mul_device_mode();
assert(g_device_count>0);
for (int i = 0; i < g_device_count; i++) {
int id = g_sycl_gpu_mgr->gpus[i];


@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
#ifdef __cplusplus
}
#endif
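A brief, hedged sketch of how a host application might pick between the two temporary SYCL device-mode entry points declared above; the wrapper function and its `single_gpu` flag are hypothetical.

```cpp
// Hedged sketch: choose the SYCL device mode before buffers/backends are created,
// using only the declarations in this header. The wrapper itself is hypothetical.
#include "ggml-sycl.h"

void init_sycl_device_mode(bool single_gpu, int main_gpu_id) {
    if (single_gpu) {
        // main_gpu_id is an id in the full SYCL device list
        // (see ggml_backend_sycl_print_sycl_devices in this change)
        ggml_backend_sycl_set_single_device_mode(main_gpu_id);
    } else {
        // use all GPUs that share the same top max-compute-units value
        ggml_backend_sycl_set_mul_device_mode();
    }
}
```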

ggml.c

@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_I64] = {
.type_name = "i64",
.blck_size = 1,
.type_size = sizeof(int64_t),
.is_quantized = false,
},
[GGML_TYPE_F64] = {
.type_name = "f64",
.blck_size = 1,
.type_size = sizeof(double),
.is_quantized = false,
.nrows = 1,
},
[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
@ -918,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) {
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
#elif defined(__AVX512F__)
#define GGML_SIMD
// F32 AVX512
#define GGML_F32_STEP 64
#define GGML_F32_EPR 16
#define GGML_F32x16 __m512
#define GGML_F32x16_ZERO _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD _mm512_loadu_ps
#define GGML_F32x16_STORE _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD _mm512_add_ps
#define GGML_F32x16_MUL _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x) \
do { \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
res = _mm512_reduce_add_ps(x[0]); \
} while (0)
// TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x16
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
// F16 AVX512
// F16 AVX
#define GGML_F16_STEP 64
#define GGML_F16_EPR 16
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
#define GGML_F32Cx16 __m512
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD _mm512_add_ps
#define GGML_F32Cx16_MUL _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x) \
do { \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
res = _mm512_reduce_add_ps(x[0]); \
} while (0)
#define GGML_F16_VEC GGML_F32Cx16
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__)
#define GGML_SIMD
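For illustration only, here is a hedged sketch of a plain dot product written directly with the AVX-512F intrinsics that the `GGML_F32x16` macros above wrap; it assumes `n` is a multiple of 16 and is not part of ggml itself.

```cpp
// Hedged sketch: what the GGML_F32x16 macros above boil down to for a dot product.
// Illustrative only; assumes AVX-512F and that n is a multiple of 16.
#include <immintrin.h>

float dot_f32_avx512(const float * x, const float * y, int n) {
    __m512 acc = _mm512_setzero_ps();                   // GGML_F32x16_ZERO
    for (int i = 0; i < n; i += 16) {
        __m512 ax = _mm512_loadu_ps(x + i);             // GGML_F32x16_LOAD
        __m512 ay = _mm512_loadu_ps(y + i);
        acc = _mm512_fmadd_ps(ax, ay, acc);             // GGML_F32x16_FMA(acc, ax, ay)
    }
    return _mm512_reduce_add_ps(acc);                   // final step of GGML_F32x16_REDUCE
}
```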
@ -12419,6 +12527,8 @@ static void ggml_compute_forward_alibi(
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
@ -12505,6 +12615,8 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);

ggml.h

@ -373,6 +373,8 @@ extern "C" {
GGML_TYPE_I8 = 24,
GGML_TYPE_I16 = 25,
GGML_TYPE_I32 = 26,
GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28,
GGML_TYPE_COUNT,
};
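A hedged sketch of what the new `GGML_TYPE_I64` entry enables: allocating a plain 64-bit integer tensor through the existing `ggml_init` / `ggml_new_tensor_1d` API. The buffer size and the wrapper function are arbitrary choices for illustration.

```cpp
// Hedged sketch: the new GGML_TYPE_I64 / GGML_TYPE_F64 entries allow plain 64-bit
// tensors; ggml_init and ggml_new_tensor_1d are existing ggml APIs.
#include "ggml.h"

void make_i64_tensor(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,   // arbitrary scratch size for this sketch
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // 1-D tensor of 8 int64 values; per the new type_traits entry, type_size is 8 and blck_size is 1
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 8);
    int64_t * data = (int64_t *) t->data;
    for (int i = 0; i < 8; ++i) {
        data[i] = i;
    }

    ggml_free(ctx);
}
```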


@ -42,6 +42,7 @@ class Keys:
EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum):
GEMMA = auto()
STARCODER2 = auto()
MAMBA = auto()
COMMAND_R = auto()
class MODEL_TENSOR(IntEnum):
@ -187,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.COMMAND_R: "command-r",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -579,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
],
MODEL_ARCH.COMMAND_R: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO
}
@ -665,6 +680,8 @@ class GGMLQuantizationType(IntEnum):
I8 = 24
I16 = 25
I32 = 26
I64 = 27
F64 = 28
class GGUFEndian(IntEnum):
@ -734,6 +751,8 @@ GGML_QUANT_SIZES = {
GGMLQuantizationType.I8: (1, 1),
GGMLQuantizationType.I16: (1, 2),
GGMLQuantizationType.I32: (1, 4),
GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8),
}


@ -242,12 +242,15 @@ class GGUFReader:
n_bytes = n_elems * type_size // block_size
data_offs = int(start_offs + offset_tensor[0])
item_type: npt.DTypeLike
if ggml_type == GGMLQuantizationType.F32:
item_count = n_elems
item_type = np.float32
elif ggml_type == GGMLQuantizationType.F16:
if ggml_type == GGMLQuantizationType.F16:
item_count = n_elems
item_type = np.float16
elif ggml_type == GGMLQuantizationType.F32:
item_count = n_elems
item_type = np.float32
elif ggml_type == GGMLQuantizationType.F64:
item_count = n_elems
item_type = np.float64
elif ggml_type == GGMLQuantizationType.I8:
item_count = n_elems
item_type = np.int8
@ -257,6 +260,9 @@ class GGUFReader:
elif ggml_type == GGMLQuantizationType.I32:
item_count = n_elems
item_type = np.int32
elif ggml_type == GGMLQuantizationType.I64:
item_count = n_elems
item_type = np.int64
else:
item_count = n_bytes
item_type = np.uint8


@ -204,18 +204,22 @@ class GGUFWriter:
for i in range(n_dims):
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
if tensor_dtype == np.float32:
dtype = GGMLQuantizationType.F32
elif tensor_dtype == np.float16:
if tensor_dtype == np.float16:
dtype = GGMLQuantizationType.F16
elif tensor_dtype == np.float32:
dtype = GGMLQuantizationType.F32
elif tensor_dtype == np.float64:
dtype = GGMLQuantizationType.F64
elif tensor_dtype == np.int8:
dtype = GGMLQuantizationType.I8
elif tensor_dtype == np.int16:
dtype = GGMLQuantizationType.I16
elif tensor_dtype == np.int32:
dtype = GGMLQuantizationType.I32
elif tensor_dtype == np.int64:
dtype = GGMLQuantizationType.I64
else:
raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
else:
dtype = raw_dtype
self.ti_data += self._pack("I", dtype)
@ -357,6 +361,9 @@ class GGUFWriter:
def add_clamp_kqv(self, value: float) -> None:
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
def add_logit_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
def add_expert_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)


@ -7,7 +7,7 @@ Just copy this single static HTML file anywhere and open it in a browser, or fro
Please go to https://github.com/LostRuins/lite.koboldai.net for updates on Kobold Lite.
If you are submitting a pull request for Lite, PLEASE use the above repo, not the KoboldCpp one.
Kobold Lite is under the AGPL v3.0 License unless otherwise exempted. Please do not remove this line.
Current version: 124
Current version: 125
-Concedo
-->
@ -373,21 +373,6 @@ Current version: 124
margin-top: 100px;
}
.loadpopup {
width: 600px;
background-color: #262626;
margin-top: 150px;
}
@media (max-width: 768px) {
.loadpopup {
width: 100%;
background-color: #262626;
margin-top: 150px;
}
}
.workerpopup {
background-color: #262626;
margin-top: 170px;
@ -420,7 +405,7 @@ Current version: 124
width: 330px;
}
.nspopup.flexsize {
width: 540px;
width: 600px;
}
@media (max-width: 620px) {
.nspopup.flexsize {
@ -2966,7 +2951,7 @@ Current version: 124
}
}
function apply_proxy_url(url)
function apply_proxy_url(url, proxy_by_default=false)
{
let proxy_part = "";
@ -2980,7 +2965,7 @@ Current version: 124
!url.toLowerCase().includes("."));
}
if (uses_cors_proxy && !is_local) {
if ((uses_cors_proxy||proxy_by_default) && !is_local) {
proxy_part = cors_proxy + "?";
}
return proxy_part + url;
@ -3357,7 +3342,7 @@ Current version: 124
const text_hordes = [
{
baseurl: "https://horde.koboldai.net",
baseurl: "https://aihorde.net",
tag: "🤖",
sort_order: 1,
client_agent: default_client_agent,
@ -3415,8 +3400,6 @@ Current version: 124
const default_oai_image_endpoint = "/images/generations";
const scale_submit_endpoint = "https://dashboard.scale.com/spellbook/api/v2/deploy/"
const claude_submit_endpoint = "/complete";
const claude_submit_endpoint_v3 = "/messages";
@ -3498,9 +3481,7 @@ Current version: 124
var custom_oai_endpoint = "";
var custom_oai_key = ""; //if set, uses the OpenAI API to generate
var custom_oai_model = "";
var custom_scale_key = "";
var custom_palm_key = "";
var custom_scale_ID = "";
var custom_claude_endpoint = "";
var custom_claude_key = "";
var custom_claude_model = "";
@ -3826,8 +3807,8 @@ Current version: 124
if(!localflag && pending_eptype>0)
{
msgboxYesNo("Reconnect to previous custom endpoint?","Custom Endpoint Reconnect",()=>{
document.getElementById("customapidropdown").value = (pending_eptype - 1).toString();
display_custom_endpoint();
document.getElementById("customapidropdown").value = (pending_eptype).toString();
display_endpoint_container();
},null);
}
}
@ -4032,7 +4013,7 @@ Current version: 124
function attempt_connect(popup_aiselect = true)
{
if (localflag) {
document.getElementById("customapidropdown").value = 0;
document.getElementById("customapidropdown").value = 1;
let protocol = "http://";
if(window.location.protocol.includes('https') && !is_using_web_lite())
{
@ -4077,7 +4058,7 @@ Current version: 124
}
}
document.body.classList.add("connected");
document.getElementById("connectstatus").innerHTML = "Connected to KoboldAI Horde";
document.getElementById("connectstatus").innerHTML = "Connected to AI Horde";
document.getElementById("connectstatus").classList.remove("color_orange");
document.getElementById("connectstatus").classList.add("color_green");
render_gametext(false);
@ -4085,12 +4066,12 @@ Current version: 124
read_url_params_data();
if (popup_aiselect) {
display_models();
display_endpoint_container();
}
}
else {
msgbox("Failed to connect to KAI Horde!\nPlease check your network connection.");
msgbox("Failed to connect to AI Service!\nPlease check your network connection.");
document.body.classList.remove("connected");
document.getElementById("connectstatus").innerHTML = "Offline Mode";
document.getElementById("connectstatus").classList.add("color_orange");
@ -4475,7 +4456,7 @@ Current version: 124
function is_using_custom_ep()
{
return (custom_oai_key!=""||custom_kobold_endpoint!=""||custom_scale_key!=""||custom_claude_key!=""||custom_palm_key!="");
return (custom_oai_key!=""||custom_kobold_endpoint!=""||custom_claude_key!=""||custom_palm_key!="");
}
function is_using_kcpp_with_streaming()
@ -5517,7 +5498,7 @@ Current version: 124
//remove common malformed ids to reduce load
if(userinput!="" && isNumeric(userinput) && userinput>0 && userinput<50000)
{
fetch(cors_proxy+"?https://aetherroom.club/api/"+userinput)
fetch(apply_proxy_url("https://aetherroom.club/api/"+userinput,true))
.then(x => x.json())
.then(data => {
console.log(data);
@ -5718,12 +5699,13 @@ Current version: 124
};
document.getElementById("scenariodesc").innerText = "Loading scenario from Pygmalion.Chat...";
fetch(cors_proxy+"?https://server.pygmalion.chat/galatea.v1.PublicCharacterService/CharacterExport", {
method: 'POST',
let charurl = "https://server.pygmalion.chat/api/export/character/"+userinput+"/v2";
fetch(apply_proxy_url(charurl,true), {
method: 'GET',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ "character_id": userinput }),
// body: JSON.stringify({ "character_id": userinput }),
referrerPolicy: 'no-referrer',
})
.then(x => {
@ -5738,12 +5720,12 @@ Current version: 124
})
.then(data => {
console.log(data);
if(data && data.card) //if fetch was successful
if(data && data.character) //if fetch was successful
{
load_temp_scenario_from_tavernobj(data.card,true);
if(data.card.data && data.card.data.avatar)
load_temp_scenario_from_tavernobj(data.character,true);
if(data.character.data && data.character.data.avatar)
{
const compressedImg = compressImage(data.card.data.avatar, (compressedImageURI, aspectratio)=>{
const compressedImg = compressImage(data.character.data.avatar, (compressedImageURI, aspectratio)=>{
temp_scenario.image = compressedImageURI;
temp_scenario.image_aspect = aspectratio;
preview_temp_scenario();
@ -6433,7 +6415,6 @@ Current version: 124
{
return !(
document.getElementById("saveloadcontainer").classList.contains("hidden") &&
document.getElementById("loadmodelcontainer").classList.contains("hidden") &&
document.getElementById("newgamecontainer").classList.contains("hidden") &&
document.getElementById("yesnocontainer").classList.contains("hidden") &&
document.getElementById("settingscontainer").classList.contains("hidden") &&
@ -6456,7 +6437,6 @@ Current version: 124
}
function hide_popups() {
document.getElementById("saveloadcontainer").classList.add("hidden");
document.getElementById("loadmodelcontainer").classList.add("hidden");
document.getElementById("newgamecontainer").classList.add("hidden");
document.getElementById("yesnocontainer").classList.add("hidden");
document.getElementById("settingscontainer").classList.add("hidden");
@ -6525,7 +6505,7 @@ Current version: 124
function explain_horde()
{
msgbox("The AI Horde generates text using crowdsourced GPUs by volunteer workers. By default your inputs are not logged, but as Horde workers are open source, they can be modified to do so. <br><br>In all cases, the sender will *always be anonymous*, however you are still advised to avoid sending privacy sensitive information.<br><br>For any issues, you can find us on discord at <a class=\"color_blueurl\" href=\"https://koboldai.org/discord\">https://koboldai.org/discord</a>","Disclaimer",true);
msgbox("The AI Horde generates text using crowdsourced GPUs by volunteer workers. By default your inputs are not logged, but as Horde workers are open source, they can be modified to do so. <br><br>In all cases, the sender will *always be anonymous*, however you are still advised to avoid sending privacy sensitive information.<br>","Disclaimer",true);
}
function selectImgStyle()
@ -6725,7 +6705,7 @@ Current version: 124
function select_custom_oai_model()
{
let isOpenrouter = (document.getElementById("customapidropdown").value==5);
let isOpenrouter = (document.getElementById("customapidropdown").value==3);
inputBox("Enter custom model name","Custom Model Name",localsettings.saved_oai_custommodel,"", ()=>{
let coai = getInputBoxValue().trim();
let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model"));
@ -6742,7 +6722,7 @@ Current version: 124
}
function oai_model_change()
{
let isOpenrouter = (document.getElementById("customapidropdown").value==5);
let isOpenrouter = (document.getElementById("customapidropdown").value==3);
let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model"));
let non_completions = (dropdown.value.includes("davinci-002") || dropdown.value.includes("text-davinci-003") || dropdown.value.includes("text-davinci-002")
|| dropdown.value.includes("text-davinci-001") || dropdown.value.includes("gpt-3.5-turbo-instruct") || dropdown.value == "davinci");
@ -6790,7 +6770,7 @@ Current version: 124
if (!data.error && data.data && data.data.length > 0)
{
let isOpenrouter = (document.getElementById("customapidropdown").value==5);
let isOpenrouter = (document.getElementById("customapidropdown").value==3);
let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model"));
var lastOption = dropdown.lastElementChild;
for (var i = dropdown.options.length - 1; i >= 0; i--) {
@ -6848,12 +6828,18 @@ Current version: 124
let epchoice = document.getElementById("customapidropdown").value;
document.getElementById("oaicustom").classList.add("hidden");
document.getElementById("koboldcustom").classList.add("hidden");
document.getElementById("scalecustom").classList.add("hidden");
document.getElementById("claudecustom").classList.add("hidden");
document.getElementById("palmcustom").classList.add("hidden");
document.getElementById("custom_oai_model").classList.add("hidden");
document.getElementById("custom_openrouter_model").classList.add("hidden");
document.getElementById("hordeloadmodelcontainer").classList.add("hidden");
if(epchoice==0)
{
document.getElementById("hordeloadmodelcontainer").classList.remove("hidden");
display_horde_models();
}
else if(epchoice==1)
{
document.getElementById("koboldcustom").classList.remove("hidden");
if(!localflag)
@ -6862,10 +6848,19 @@ Current version: 124
document.getElementById("customkoboldkey").value = localsettings.saved_kai_key;
}
}
else if(epchoice==1 || epchoice==5)
else if(epchoice==2 || epchoice==3)
{
document.getElementById("oaicustom").classList.remove("hidden");
if(epchoice==5)
if(epchoice==2)
{
document.getElementById("oaidesc").classList.remove("hidden");
document.getElementById("custom_oai_model").classList.remove("hidden");
document.getElementById("openrouterdesc").classList.add("hidden");
document.getElementById("custom_oai_endpoint").classList.remove("hidden");
document.getElementById("custom_oai_key").value = localsettings.saved_oai_key;
document.getElementById("custom_oai_endpoint").value = (localsettings.saved_oai_addr?localsettings.saved_oai_addr:default_oai_base);
}
else
{
document.getElementById("oaidesc").classList.add("hidden");
document.getElementById("openrouterdesc").classList.remove("hidden");
@ -6883,24 +6878,10 @@ Current version: 124
}
}
}
else
{
document.getElementById("oaidesc").classList.remove("hidden");
document.getElementById("custom_oai_model").classList.remove("hidden");
document.getElementById("openrouterdesc").classList.add("hidden");
document.getElementById("custom_oai_endpoint").classList.remove("hidden");
document.getElementById("custom_oai_key").value = localsettings.saved_oai_key;
document.getElementById("custom_oai_endpoint").value = (localsettings.saved_oai_addr?localsettings.saved_oai_addr:default_oai_base);
}
oai_model_change();
toggleoaichatcompl();
}
else if(epchoice==2)
{
document.getElementById("scalecustom").classList.remove("hidden");
}
else if(epchoice==3)
else if(epchoice==4)
{
toggleclaudemodel();
document.getElementById("claudecustom").classList.remove("hidden");
@ -6909,7 +6890,7 @@ Current version: 124
document.getElementById("claudesystemprompt").value = localsettings.saved_claude_jailbreak;
document.getElementById("claudejailbreakprompt").value = localsettings.saved_claude_jailbreak2;
}
else if(epchoice==4)
else if(epchoice==5)
{
document.getElementById("palmcustom").classList.remove("hidden");
document.getElementById("custom_palm_key").value = localsettings.saved_palm_key;
@ -6952,12 +6933,15 @@ Current version: 124
custom_kobold_endpoint = "";
custom_kobold_key = "";
custom_oai_key = "";
custom_scale_key = "";
custom_claude_key = "";
custom_palm_key = "";
let epchoice = document.getElementById("customapidropdown").value;
if(epchoice==0) //connect to kobold endpoint
if(epchoice==0) //ai horde
{
confirm_horde_models();
}
else if(epchoice==1) //connect to kobold endpoint
{
let desiredkoboldendpoint = document.getElementById("customkoboldendpoint").value;
let desiredkoboldkey = document.getElementById("customkoboldkey").value;
@ -7203,7 +7187,7 @@ Current version: 124
});
}
}
else if(epchoice==1 || epchoice==5) //connect to OAI / OpenRouter Endpoint
else if(epchoice==2 || epchoice==3) //connect to OAI / OpenRouter Endpoint
{
let desired_oai_key = document.getElementById("custom_oai_key").value.trim();
let desired_oai_ep = document.getElementById("custom_oai_endpoint").value.trim();
@ -7230,7 +7214,7 @@ Current version: 124
//good to go
custom_oai_endpoint = desired_oai_ep;
custom_oai_key = desired_oai_key;
if(epchoice==1)
if(epchoice==2)
{
localsettings.saved_oai_key = custom_oai_key;
localsettings.saved_oai_addr = custom_oai_endpoint;
@ -7246,7 +7230,7 @@ Current version: 124
}
localsettings.saved_oai_role = document.getElementById("oairoledropdown").value;
localsettings.saved_oai_jailbreak2 = document.getElementById("jailbreakprompttext2").value;
let isOpenrouter = (document.getElementById("customapidropdown").value==5);
let isOpenrouter = (document.getElementById("customapidropdown").value==3);
let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model"));
custom_oai_model = dropdown.value.trim();
localsettings.saved_oai_custommodel = custom_oai_model;
@ -7268,74 +7252,7 @@ Current version: 124
render_gametext(true);
}
}
else if(epchoice==2) //connect to Scale Endpoint
{
let desired_scale_key = document.getElementById("custom_scale_key").value.trim();
let desired_scale_ID = document.getElementById("custom_scale_ID").value.trim();
desired_scale_ID = desired_scale_ID.split("#")[0];
desired_scale_ID = desired_scale_ID.split("?")[0];
if(desired_scale_ID.includes("dashboard.scale.com/spellbook/api/v2/deploy/") &&
desired_scale_key.length == 25 &&!desired_scale_key.includes(" ")&&!desired_scale_key.includes("/"))
{
desired_scale_ID = desired_scale_ID.split("dashboard.scale.com/spellbook/api/v2/deploy/")[1];
}
else
{
desired_scale_ID = "";
desired_scale_key = "";
msgbox("Invalid inputs, please try again.");
}
if(desired_scale_key!="" && desired_scale_ID!="")
{
hide_popups();
fetch(cors_proxy+"?"+ scale_submit_endpoint+desired_scale_ID, {
method: 'GET',
headers: {
'Authorization': 'Bearer '+desired_scale_key,
},
referrerPolicy: 'no-referrer',
})
.then((response) => response.json())
.then((data) => {
console.log(data);
if (data.message && data.message!="")
{
//good to go
custom_scale_key = desired_scale_key;
custom_scale_ID = desired_scale_ID;
selected_models = [{ "performance": 100.0, "queued": 0.0, "eta": 0, "name": "SpellbookScaleAI", "count": 1 }];
selected_workers = [];
if (perfdata == null) {
//generate some fake perf data if horde is offline and using custom endpoint
perfdata = {
"queued_requests": 0,
"queued_tokens": 0,
"past_minute_tokens": 0,
"worker_count": 0
};
document.body.classList.add("connected");
document.getElementById("connectstatus").classList.remove("color_orange");
document.getElementById("connectstatus").classList.add("color_green");
}
document.getElementById("connectstatus").innerHTML = "Connected to ScaleAI Endpoint";
render_gametext();
}
else
{
custom_scale_key = "";
msgbox("Cannot connect to Spellbook by ScaleAI");
}
})
.catch(error => {
console.log("Error: " + error);
custom_scale_key = "";
msgbox("Error: " + error);
});
}
}
else if(epchoice==3) //claude endpoint
else if(epchoice==4) //claude endpoint
{
let desired_claude_key = document.getElementById("custom_claude_key").value.trim();
let desired_claude_ep = document.getElementById("custom_claude_endpoint").value.trim();
@ -7387,7 +7304,7 @@ Current version: 124
}
}
else if(epchoice==4) //palm endpoint
else if(epchoice==5) //palm endpoint
{
let desired_palm_key = document.getElementById("custom_palm_key").value.trim();
let mdlname = document.getElementById("custom_palm_model").value;
@ -7421,12 +7338,12 @@ Current version: 124
}
function display_custom_endpoint()
function display_endpoint_container()
{
document.getElementById("customendpointcontainer").classList.remove("hidden");
customapi_dropdown();
}
function dismiss_custom_endpoint()
function dismiss_endpoint_container()
{
document.getElementById("customendpointcontainer").classList.add("hidden");
}
@ -7584,9 +7501,8 @@ Current version: 124
}
//function to allow selection of models
function display_models() {
function display_horde_models() {
document.getElementById("pickedmodel").innerHTML = "";
document.getElementById("loadmodelcontainer").classList.remove("hidden");
document.getElementById("apikey").value = localsettings.my_api_key;
document.getElementById("modelquicksearch").value = "";
let manualworker = (document.getElementById("manualworker").checked ? true : false);
@ -7643,7 +7559,6 @@ Current version: 124
}
model_choices += "<option value=\"" + i + "\" " + alrselected + ">" + clustertag + escapeHtml(curr.name) + " (ETA: "+ curr.eta +"s, Queue: " + curr.queued + ", Speed: " + mperf + ", Qty: " + curr.count + ")</option>";
}
model_choices += "<option style=\"color:#dd7723;font-weight:bold;\" value=\"9999\">📡 [ Remote Play / Custom API Endpoint ]</option>";
document.getElementById("pickedmodel").innerHTML = model_choices;
}
}
@ -7689,24 +7604,13 @@ Current version: 124
}
}
function confirm_models() {
function confirm_horde_models() {
let selected_idx_arr = Array.from(document.getElementById("pickedmodel").selectedOptions).map(({ value }) => value);
if (selected_idx_arr.length == 1 && selected_idx_arr[0] == 9999) //custom endpoint
{
hide_popups();
display_custom_endpoint();
} else {
custom_kobold_endpoint = "";
custom_oai_key = "";
custom_scale_key = "";
custom_claude_key = "";
custom_palm_key = "";
//remove the Custom Endpoint if it's multi selected together with others.
const findex = selected_idx_arr.indexOf("9999");
if (findex > -1) {
selected_idx_arr.splice(findex, 1);
}
if (selected_idx_arr.length > 0) {
let prep_sel_models = [];
@ -7765,7 +7669,7 @@ Current version: 124
desired_new_home_cluster = null;
}
document.getElementById("connectstatus").innerHTML = "Connected to KoboldAI Horde";
document.getElementById("connectstatus").innerHTML = "Connected to AI Horde";
render_gametext();
hide_popups();
@ -7774,8 +7678,6 @@ Current version: 124
{
msgbox("You've selected multiple workers from different clusters. Only one cluster will be used.","Caution");
}
}
}
}
@ -7902,7 +7804,7 @@ Current version: 124
if(uname.toLowerCase()=="anonymous#0")
{
document.getElementById("kudos_bal").innerHTML = clustertag + uname + "<br>"+
"<a class='color_blueurl' href='https://horde.koboldai.net/register'>(Register New User)</a>";
"<a class='color_blueurl' href='https://aihorde.net/register'>(Register New User)</a>";
}else{
document.getElementById("showownworkerslink").classList.remove("hidden");
}
@ -7913,12 +7815,12 @@ Current version: 124
}
else {
document.getElementById("kudos_bal").innerHTML = "API Key Error<br><a class='color_blueurl' href='https://horde.koboldai.net/register'>(Register New User)</a>";
document.getElementById("kudos_bal").innerHTML = "API Key Error<br><a class='color_blueurl' href='https://aihorde.net/register'>(Register New User)</a>";
}
}
else {
console.log("Error: " + errArr);
document.getElementById("kudos_bal").innerHTML = "API Key Error<br><a class='color_blueurl' href='https://horde.koboldai.net/register'>(Register New User)</a>";
document.getElementById("kudos_bal").innerHTML = "API Key Error<br><a class='color_blueurl' href='https://aihorde.net/register'>(Register New User)</a>";
}
});
}
@ -9131,6 +9033,10 @@ Current version: 124
}, true, false, imgres,0.35,true);
}
function clear_paste_window()
{
document.getElementById("pasteimgwin").value = "";
}
function img_paste_event(event)
{
var items = (event.clipboardData || event.originalEvent.clipboardData).items;
@ -10155,46 +10061,10 @@ Current version: 124
msgbox("Error while submitting prompt: " + error);
});
}
else if (custom_scale_key != "")//handle for Scale
{
let targetep = cors_proxy + "?" + scale_submit_endpoint + custom_scale_ID;
let scale_payload = { "input": { "input": submit_payload.prompt } };
last_request_str = JSON.stringify(scale_payload);
fetch(targetep, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Basic ' + custom_scale_key,
},
body: JSON.stringify(scale_payload),
referrerPolicy: 'no-referrer',
})
.then((response) => response.json())
.then((data) => {
console.log("sync finished response: " + JSON.stringify(data));
if (custom_scale_key != "" && data.output != null && data.output != "") {
synchro_polled_response = data.output;
}
else {
//error occurred, maybe captcha failed
console.error("error occurred in Scale generation");
clear_poll_flags();
render_gametext();
msgbox("Error occurred during text generation: " + formatError(data));
}
})
.catch((error) => {
console.error('Error:', error);
clear_poll_flags();
render_gametext();
msgbox("Error while submitting prompt: " + error);
});
}
else if (custom_claude_key != "")//handle for Claude
{
let claudev3mode = custom_claude_model.toLowerCase().includes("claude-3");
let targetep = cors_proxy + "?" + (custom_claude_endpoint + (claudev3mode?claude_submit_endpoint_v3:claude_submit_endpoint));
let targetep = apply_proxy_url((custom_claude_endpoint + (claudev3mode?claude_submit_endpoint_v3:claude_submit_endpoint)),true);
let claude_payload = null;
if(claudev3mode)
{
@ -12041,10 +11911,6 @@ Current version: 124
{
whorun = "<br>You're using the OpenAI API";
}
else if(custom_scale_key!="")
{
whorun = "<br>You're using the Spellbook by Scale AI API";
}
else if(custom_claude_key!="")
{
whorun = "<br>You're using the Claude API";
@ -12281,8 +12147,8 @@ Current version: 124
document.getElementById("fvico").href = favivon_normal;
}
else if (selected_models.length == 0 && selected_workers.length == 0) {
let perfinfo = "There are <span class=\"color_orange\">" + perfdata.worker_count + "</span> total <a class=\"color_green\" href=\"#\" onclick=\"get_and_show_workers()\">volunteer(s)</a> in the KoboldAI Horde, and <span class=\"color_orange\">" + perfdata.queued_requests + "</span> request(s) in queues.<br>A total of <span class=\"color_orange\">" + perfdata.past_minute_tokens + "</span> tokens were generated in the last minute.<br><br>";
document.getElementById("gametext").innerHTML = "Welcome to <span class=\"color_cyan\">KoboldAI Lite</span>!<br><br>" + perfinfo + "<a href=\"#\" class=\"color_blueurl\" onclick=\"display_models()\">Please select an AI model to use!</a><br>";
let perfinfo = "There are <span class=\"color_orange\">" + perfdata.worker_count + "</span> total <a class=\"color_green\" href=\"#\" onclick=\"get_and_show_workers()\">volunteer(s)</a> in the AI Horde, and <span class=\"color_orange\">" + perfdata.queued_requests + "</span> request(s) in queues.<br>A total of <span class=\"color_orange\">" + perfdata.past_minute_tokens + "</span> tokens were generated in the last minute.<br><br>";
document.getElementById("gametext").innerHTML = "Welcome to <span class=\"color_cyan\">KoboldAI Lite</span>!<br><br>" + perfinfo + "<a href=\"#\" class=\"color_blueurl\" onclick=\"display_endpoint_container()\">Please select an AI service to use!</a><br>";
document.getElementById("fvico").href = favivon_normal;
}
else if (pending_response_id == "") {
@ -12595,14 +12461,10 @@ Current version: 124
{
localsettings.prev_custom_endpoint_type = 2;
if(custom_oai_endpoint.toLowerCase().includes("openrouter.ai"))
{
localsettings.prev_custom_endpoint_type = 6;
}
}
else if(custom_scale_key!="")
{
localsettings.prev_custom_endpoint_type = 3;
}
}
else if(custom_claude_key!="")
{
localsettings.prev_custom_endpoint_type = 4;
@ -13553,11 +13415,11 @@ Current version: 124
</li>
<li class="nav-item hidden" id="topbtn_customendpt">
<a class="nav-link" href="#" onclick="display_custom_endpoint()">Custom Endpoint</a>
<a class="nav-link" href="#" onclick="display_endpoint_container()">Custom Endpoint</a>
</li>
<li class="nav-item hidden" id="topbtn_ai">
<a class="nav-link" href="#" onclick="display_models()">AI</a>
<a class="nav-link" href="#" onclick="display_endpoint_container()">AI</a>
</li>
<li class="nav-item hidden" id="topbtn_newgame">
@ -13648,7 +13510,7 @@ Current version: 124
</div>
</div>
</div>
<div class="lastreq" id="lastreq" style="color:#999999"><span class="color_gray">Avoid sending privacy sensitive information. <a href="#" onclick="explain_horde()">Click here for more info</a>.</span></div>
<div class="lastreq" id="lastreq" style="color:#999999"><span class="color_gray">KoboldAI Lite - A frontend for self hosted and third party API services</span></div>
</div>
<div id="enhancedchatinterface" class="chat_mesgs hidden">
@ -13683,7 +13545,7 @@ Current version: 124
</div>
</div>
<div class="lastreq" id="lastreq2" style="padding-top: 2px; color:#999999"><span class="color_gray">Avoid sending privacy sensitive information. <a href="#" onclick="explain_horde()">Click here for more info</a>.</span></div>
<div class="lastreq" id="lastreq2" style="padding-top: 2px; color:#999999"><span class="color_gray">KoboldAI Lite - A frontend for self hosted and third party API services.</span></div>
</div>
</div>
@ -13754,30 +13616,41 @@ Current version: 124
</div>
</div>
<div class="popupcontainer flex hidden" id="loadmodelcontainer">
<div class="popupcontainer flex hidden" id="customendpointcontainer">
<div class="popupbg flex"></div>
<div class="loadpopup">
<div class="nspopup flexsize higher">
<div class="popuptitlebar">
<div class="popuptitletext">Select A Model To Load</div>
<div class="popuptitletext">Select your AI provider</div>
</div>
<div style="padding: 4px;">
<select style="padding:4px;" class="form-control" id="customapidropdown" onchange="customapi_dropdown()">
<option value="0">AI Horde</option>
<option value="1">KoboldAI Remote API</option>
<option value="2">OpenAI API</option>
<option value="3">OpenRouter API</option>
<option value="4">Claude By Anthropic API</option>
<option value="5">PaLM/Gemini By Google API</option>
</select>
</div>
<div id="loadmodellistcontent" style="overflow: auto;">
<div class="aidgpopuplistheader anotelabel" id="hordeloadmodelcontainer">
The AI Horde is a service that generates text using crowdsourced GPUs run by independent volunteer workers. Avoid sending privacy sensitive information. <a href="#" class="color_blueurl" onclick="explain_horde()">Click here for more info</a>
<div class="justifyleft anotelabel">
<span style="float:left; text-align: left;">
Your AI Horde API Key <span class="helpicon">?
<span class="helptext">You need an API key to use KoboldAI Horde to generate text. Get one at
https://horde.koboldai.net/register or use the anonymous key 0000000000.</span>
<span class="helptext">You need an API key to use AI Horde to generate text. Get one at
https://aihorde.net/register or use the anonymous key 0000000000.</span>
</span>
<br><a href="#" id="showownworkerslink" class="color_blueurl hidden" onclick="show_my_own_workers()">[Manage My Workers]</a></span>
<span class="color_green" style="float:right; text-align: right;" id="kudos_bal">
Need a Key?<br><a class='color_blueurl' href='https://horde.koboldai.net/register'>(Register New User)</a>
Need a Key?<br><a class='color_blueurl' href='https://aihorde.net/register'>(Register New User)</a>
</span>
</div>
<input class="form-control" type="password" placeholder="Enter API Key (or use 0000000000)" value=""
id="apikey" onfocus="focus_api_keys()" onblur="fetch_kudo_balance();blur_api_keys()">
<div class="justifyleft anotelabel">
Select AI Horde Model <span class="helpicon">?
<span class="helptext">These are the models currently provided by AI Horde volunteers.</span>
@ -13792,44 +13665,15 @@ Current version: 124
Select By Worker <span class="helpicon">?
<span class="helptext">This option explicitly assigns worker IDs, fixed based on the current workers available at model selection time.</span>
</span>
<input type="checkbox" id="manualworker" onclick="display_models()">
<input type="checkbox" id="manualworker" onclick="display_endpoint_container()">
<span style="float:right;">
<input class="settinglabel miniinput" style="margin: 3px; width: 90px;" type="text" placeholder="Quick Search" value="" id="modelquicksearch" oninput="model_quick_search()">
</span>
<span style="float:right;">
<button type="button" style="padding:2px 3px;margin:2px;font-size:11px;" class="bg_orange btn btn-primary" id="btn_cust_endpoint" onclick="display_custom_endpoint()">Use Custom Endpoint</button>
</span>
</div>
<div class="popupfooter">
<button type="button" class="btn btn-primary" id="btn_loadmodelaccept"
onclick="confirm_models()">Ok</button>
<button type="button" class="btn btn-primary" id="btn_loadmodelclose"
onclick="hide_popups()">Cancel</button>
</div>
</div>
</div>
</div>
<div class="popupcontainer flex hidden" id="customendpointcontainer">
<div class="popupbg flex"></div>
<div class="nspopup flexsize higher">
<div class="popuptitlebar">
<div class="popuptitletext">Custom Remote Play Endpoint</div>
</div>
<div style="padding: 4px;">
<select style="padding:4px;" class="form-control" id="customapidropdown" onchange="customapi_dropdown()">
<option value="0">KoboldAI Remote Play</option>
<option value="1">OpenAI API</option>
<option value="2">Spellbook By Scale API</option>
<option value="3">Claude By Anthropic API</option>
<option value="4">PaLM/Gemini By Google API</option>
<option value="5">OpenRouter API</option>
</select>
</div>
<div id="koboldcustom" class="aidgpopuplistheader anotelabel">
You can use this to connect to a KoboldAI instance running via a remote tunnel such as <span class="color_orange" style="font-weight: bold;">trycloudflare, localtunnel, ngrok</span>.<br><br>
Localhost IPs require host mode enabled. You can use the remote address displayed in the <span class="color_orange" style="font-weight: bold;">remote-play.bat</span> window or <span class="color_orange" style="font-weight: bold;">colab window</span>, note that the model must be loaded first.<br><br>
@ -13840,7 +13684,6 @@ Current version: 124
<input type="checkbox" id="remoteconsolelog">
<div class="box-label" title="Will display outputs to the remote endpoint's console logs, useful for debugging.">Show Console Logging</div>
</div>
</div>
<div id="oaicustom" class="aidgpopuplistheader anotelabel hidden">
<span id="oaidesc">
@ -13903,15 +13746,6 @@ Current version: 124
</span>
</div>
<div id="scalecustom" class="aidgpopuplistheader anotelabel hidden">
Uses Spellbook by Scale. This is an experimental endpoint. It may break at any time.<br><br>
This endpoint does not support custom settings - please configure all properties at the scale webpage. Note that KoboldAI Lite takes no responsibility for your usage or consequences of this feature.<br><span class="color_red">Due to CORS limitations, your connection will be proxied.</span><br><br>
<span class="color_green" style="font-weight: bold;">Please input Spellbook by Scale API Key.</span><br><br>
<input class="form-control" type="text" id="custom_scale_key" placeholder="Spellbook API Key" value=""><br>
<span class="color_green" style="font-weight: bold;">Please input the Deployment URL provided from the website.</span><br><br>
<input class="form-control" type="text" id="custom_scale_ID" placeholder="https://dashboard.scale.com/spellbook/api/v2/deploy/a12345b" value="" ><br>
</div>
<div id="claudecustom" class="aidgpopuplistheader anotelabel hidden">
Entering your Claude API key will allow you to use KoboldAI Lite with their API.<br><br>
Note that KoboldAI Lite takes no responsibility for your usage or consequences of this feature.<br>Only Temperature, Top-P and Top-K samplers are used.<br><br>
@ -13956,7 +13790,7 @@ Current version: 124
</div>
<div class="popupfooter">
<button type="button" class="btn btn-primary" onclick="connect_custom_endpoint()">Ok</button>
<button type="button" class="btn btn-primary" onclick="dismiss_custom_endpoint()">Cancel</button>
<button type="button" class="btn btn-primary" onclick="dismiss_endpoint_container()">Cancel</button>
</div>
</div>
</div>
@ -14921,7 +14755,7 @@ Current version: 124
<div class="popuptitlebar">
<div class="popuptitletext">Paste Image From Clipboard</div>
</div>
<input type="text" style="width:100%; height:100px; text-align: center;" readonly="true" onpaste="return img_paste_event(event)" value="" placeholder="[Paste Image Here]">
<input type="text" id="pasteimgwin" style="width:100%; height:100px; text-align: center;" oninput="clear_paste_window()" onpaste="return img_paste_event(event)" value="" placeholder="[Paste Image Here]">
<br>
<div class="popupfooter">
<button type="button" class="btn btn-primary" onclick="hide_popups()">Cancel</button>

357
llama.cpp
View file

@ -238,6 +238,7 @@ enum llm_arch {
LLM_ARCH_GEMMA,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_COMMAND_R,
LLM_ARCH_UNKNOWN,
};
@ -267,6 +268,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@ -292,6 +294,7 @@ enum llm_kv {
LLM_KV_EXPERT_COUNT,
LLM_KV_EXPERT_USED_COUNT,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@ -356,6 +359,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@ -862,6 +866,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
},
},
{
LLM_ARCH_COMMAND_R,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@ -1625,6 +1644,7 @@ enum e_model {
MODEL_20B,
MODEL_30B,
MODEL_34B,
MODEL_35B,
MODEL_40B,
MODEL_65B,
MODEL_70B,
@ -1671,6 +1691,7 @@ struct llama_hparams {
float f_clamp_kqv = 0.0f;
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;
bool causal_attn = true;
bool need_kq_pos = false;
@ -1901,6 +1922,31 @@ struct llama_kv_cache {
}
};
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<struct ggml_context *> ctxs;
std::vector<ggml_backend_buffer_t> bufs;
int32_t layer_start = -1;
int32_t layer_end = -1;
ggml_tensor * tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}
return tensors[il];
}
~llama_control_vector() {
for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
ggml_backend_buffer_free(buf);
}
}
};
struct llama_vocab {
using id = int32_t;
using token = std::string;
@ -2119,6 +2165,9 @@ struct llama_context {
struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
// control vectors
struct llama_control_vector cvec;
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
@ -3277,6 +3326,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_20B: return "20B";
case MODEL_30B: return "30B";
case MODEL_34B: return "34B";
case MODEL_35B: return "35B";
case MODEL_40B: return "40B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
@ -3669,6 +3719,15 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_COMMAND_R:
{
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 40: model.type = e_model::MODEL_35B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0;
}
@ -4009,6 +4068,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@ -4990,6 +5050,37 @@ static bool llm_load_tensors(
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
}
} break;
case LLM_ARCH_COMMAND_R:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
// init output from the input tok embed
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
ml.n_created--; // artificial tensor
ml.size_data += ggml_nbytes(model.output);
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
default:
throw std::runtime_error("unknown architecture");
}
@ -5136,6 +5227,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
#endif
#ifdef GGML_USE_SYCL
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_sycl_set_single_device_mode(params.main_gpu);
//SYCL uses device indices (0, 1, 2) directly; the user supplies a device id, which is then converted to a device index.
params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
} else {
ggml_backend_sycl_set_mul_device_mode();
}
#endif
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
@ -5930,6 +6031,12 @@ struct llm_build_context {
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
if (layer_dir != nullptr) {
cur = ggml_add(ctx0, cur, layer_dir);
}
cb(cur, "l_out", il);
// input for next layer
@ -5965,7 +6072,7 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@ -6015,7 +6122,6 @@ struct llm_build_context {
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
@ -8377,6 +8483,121 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_command_r() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
const float f_logit_scale = hparams.f_logit_scale;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM, cb, il);
cb(cur, "attn_norm", il);
struct ggml_tensor * ffn_inp = cur;
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
struct ggml_tensor * attn_out = cur;
// feed-forward network
{
cur = llm_build_ffn(ctx0, ffn_inp,
model.layers[il].ffn_up, NULL,
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// add together residual + FFN + self-attention
cur = ggml_add(ctx0, cur, inpL);
cur = ggml_add(ctx0, cur, attn_out);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);
}
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
};
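As a reading aid, here is an informal summary of the per-layer computation that build_command_r assembles above (the notation is ours, not taken from the source): the attention and feed-forward branches both consume the same pre-normed activations and are added to the residual in parallel, and the tied-embedding logits are scaled by f_logit_scale when it is non-zero.

```latex
% Informal sketch of one Command-R block as built above (our notation, not from the source).
\[
\begin{aligned}
\tilde{h}_\ell  &= \mathrm{LayerNorm}(h_\ell) \\
h_{\ell+1}      &= h_\ell + \mathrm{Attn}(\tilde{h}_\ell) + \mathrm{FFN}(\tilde{h}_\ell) \\
\mathrm{logits} &= f_{\mathrm{logit\_scale}} \cdot W_{\mathrm{embd}}^{\top}\, \mathrm{LayerNorm}(h_L)
\end{aligned}
\]
```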
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -8559,6 +8780,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_mamba();
} break;
case LLM_ARCH_COMMAND_R:
{
result = llm.build_command_r();
} break;
default:
GGML_ASSERT(false);
}
@ -13228,23 +13453,22 @@ struct llama_context * llama_new_context_with_model(
if (model->n_gpu_layers > 0) {
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
int device_id = id_list[i];
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
llama_free(ctx);
return nullptr;
}
@ -13445,6 +13669,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_ORION:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_MINICPM:
case LLM_ARCH_COMMAND_R:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
@ -13481,6 +13706,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
return model->hparams.n_embd;
}
int32_t llama_n_layer(const struct llama_model * model) {
return model->hparams.n_layer;
}
float llama_rope_freq_scale_train(const struct llama_model * model) {
return model->hparams.rope_freq_scale_train;
}
@ -13580,6 +13809,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
}
}
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
// count layer buffer types
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
for (int64_t i = 0; i < model.hparams.n_layer; i++) {
buft_layer_count[model.buft_layer[i].buft]++;
}
// allocate contexts
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
int n_layers = it.second;
struct ggml_init_params params = {
/*.mem_size =*/ n_layers * ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ctx_map[it.first] = ctx;
}
// make tensors
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < model.hparams.n_layer; il++) {
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
cvec.tensors.push_back(tensor);
}
// allocate tensors / buffers and zero
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.ctxs.push_back(ctx);
cvec.bufs.push_back(buf);
}
return true;
}
int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
const llama_model & model = lctx->model;
llama_control_vector & cvec = lctx->cvec;
if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
return 0;
}
if (n_embd != (int) model.hparams.n_embd) {
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
return 1;
}
if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
return 1;
}
}
cvec.layer_start = il_start;
cvec.layer_end = il_end;
for (size_t il = 1; il < model.hparams.n_layer; il++) {
assert(cvec.tensors[il] != nullptr);
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
}
}
return 0;
}
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
struct llama_kv_cache_view result = {
/*.n_cells = */ 0,
@ -14564,6 +14883,26 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<start_of_turn>model\n";
}
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
// OrionStarAI/Orion-14B-Chat
std::string system_prompt = "";
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
// there is no system message support, so we merge it into the user prompt
system_prompt = message->content;
continue;
} else if (role == "user") {
ss << "Human: ";
if (!system_prompt.empty()) {
ss << system_prompt << "\n\n";
system_prompt = "";
}
ss << message->content << "\n\nAssistant: </s>";
} else {
ss << message->content << "</s>";
}
}
} else {
// template not supported
return -1;

15
llama.h
View file

@ -388,6 +388,7 @@ extern "C" {
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ -440,6 +441,20 @@ extern "C" {
const char * path_base_model,
int32_t n_threads);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.
// n_embd should be the size of a single layer's control, and data should point
// to an n_embd x n_layers buffer starting from layer 1.
// il_start and il_end are the layer range the vector should apply to (both inclusive)
// See llama_control_vector_load in common to load a control vector.
LLAMA_API int32_t llama_control_vector_apply(
struct llama_context * lctx,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end);
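To make the new API concrete, here is a minimal usage sketch (not part of the change itself): it loads a control vector with the llama_control_vector_load helper declared in common and applies it to an already-created context. The file name "direction.gguf", the 0.8 strength, the 10..20 layer range, and the wrapper function name are made-up examples.

```cpp
// Sketch only: load one control vector file, scale it, and apply it to a layer range.
#include <algorithm>
#include <vector>

#include "common.h"
#include "llama.h"

static bool apply_example_control_vector(llama_context * ctx, const llama_model * model) {
    // hypothetical file name and strength
    std::vector<llama_control_vector_load_info> infos = {
        { /* strength */ 0.8f, /* fname */ "direction.gguf" },
    };

    // llama_control_vector_load returns { -1, empty } on error
    llama_control_vector_data cv = llama_control_vector_load(infos);
    if (cv.n_embd == -1) {
        return false;
    }

    // cv.data holds n_embd floats per layer, starting at layer 1
    const int32_t n_layer = llama_n_layer(model);
    const int32_t err = llama_control_vector_apply(ctx,
            cv.data.data(), cv.data.size(),
            cv.n_embd,
            /* il_start */ 10,
            /* il_end   */ std::min<int32_t>(20, n_layer));
    return err == 0;
}
```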
//
// KV cache
//

tests/test-chat-template.cpp
View file

@ -31,6 +31,8 @@ int main(void) {
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
// google/gemma-7b-it
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
// OrionStarAI/Orion-14B-Chat
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
@ -45,6 +47,8 @@ int main(void) {
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
// google/gemma-7b-it
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
// OrionStarAI/Orion-14B-Chat
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
};
std::vector<char> formatted_chat(1024);
int32_t res;