examples : add model conversion tool/example (#15455)

* examples : add model conversion tool/example

This commit adds an "example/tool" that is intended to help in the
process of converting models to GGUF. Currently it supports normal
causal models and embedding models. The readme contains instructions and
commands to guide you through the process.

The motivation for this is to have a structured and repeatable process for
model conversions, and hopefully to improve upon it over time to make the
process easier and more reliable. We have started to use this for new model
conversions internally and will continue doing so, improving it as we go
along. Perhaps with time this should be placed in a different directory than
the examples directory, but for now it seems like a good place to keep it
while we are still developing it.

* squash! examples : add model conversion tool/example

Remove dependency on scikit-learn in model conversion example.

* squash! examples : add model conversion tool/example

Update the transformers dependency to use a non-dev version, and import
`AutoModelForCausalLM` instead of `AutoModel` to ensure compatibility with
the latest version.

* squash! examples : add model conversion tool/example

Remove the logits requirements file from the all requirements file.
Daniel Bevenius 2025-08-21 12:16:54 +02:00 committed by GitHub
parent b108e42904
commit 2758fa10da
33 changed files with 2230 additions and 0 deletions

@@ -34,6 +34,7 @@ else()
    add_subdirectory(gen-docs)
    add_subdirectory(training)
    add_subdirectory(diffusion)
    add_subdirectory(model-conversion)
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading

examples/model-conversion/.gitignore (new file)

@@ -0,0 +1,3 @@
.model_name
data
ppl

@@ -0,0 +1,5 @@
set(TARGET llama-logits)
add_executable(${TARGET} logits.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -0,0 +1,163 @@
# Validation functions
define validate_model_path
@if [ -z "$(MODEL_PATH)" ]; then \
echo "Error: MODEL_PATH must be provided either as:"; \
echo " 1. Environment variable: export MODEL_PATH=/path/to/model"; \
echo " 2. Command line argument: make $(1) MODEL_PATH=/path/to/model"; \
exit 1; \
fi
endef
define validate_embedding_model_path
@if [ -z "$(EMBEDDING_MODEL_PATH)" ]; then \
echo "Error: EMBEDDING_MODEL_PATH must be provided either as:"; \
echo " 1. Environment variable: export EMBEDDING_MODEL_PATH=/path/to/model"; \
echo " 2. Command line argument: make $(1) EMBEDDING_MODEL_PATH=/path/to/model"; \
exit 1; \
fi
endef
###
### Causal Model targets/recipes
###
causal-convert-model-bf16: OUTTYPE=bf16
causal-convert-model-bf16: causal-convert-model
causal-convert-model:
$(call validate_model_path,causal-convert-model)
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
./scripts/causal/convert-model.sh
causal-run-original-model:
$(call validate_model_path,causal-run-original-model)
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py
causal-run-converted-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
causal-verify-logits: causal-run-original-model causal-run-converted-model
@./scripts/causal/compare-logits.py
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
causal-run-original-embeddings:
@./scripts/causal/run-casual-gen-embeddings-org.sh
causal-run-converted-embeddings:
@./scripts/causal/run-converted-model-embeddings-logits.sh
causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-embeddings
@./scripts/causal/compare-embeddings-logits.sh
causal-inspect-original-model:
@./scripts/utils/inspect-org-model.py
causal-inspect-converted-model:
@./scripts/utils/inspect-converted-model.sh
causal-start-embedding-server:
@./scripts/utils/run-embedding-server.sh ${CONVERTED_MODEL}
causal-curl-embedding-endpoint: causal-run-original-embeddings
@./scripts/utils/curl-embedding-server.sh | ./scripts/causal/compare-embeddings-logits.sh
causal-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
causal-quantize-Q8_0: causal-quantize-model
causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-Q4_0: causal-quantize-model
causal-quantize-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
causal-run-quantized-model:
@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
###
### Embedding Model targets/recipes
###
embedding-convert-model-bf16: OUTTYPE=bf16
embedding-convert-model-bf16: embedding-convert-model
embedding-convert-model:
$(call validate_embedding_model_path,embedding-convert-model)
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
./scripts/embedding/convert-model.sh
embedding-run-original-model:
$(call validate_embedding_model_path,embedding-run-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
embedding-run-converted-model:
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
@./scripts/embedding/compare-embeddings-logits.sh
embedding-inspect-original-model:
$(call validate_embedding_model_path,embedding-inspect-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
embedding-inspect-converted-model:
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
embedding-start-embedding-server:
@./scripts/utils/run-embedding-server.sh ${CONVERTED_EMBEDDING_MODEL}
embedding-curl-embedding-endpoint:
@./scripts/utils/curl-embedding-server.sh | ./scripts/embedding/compare-embeddings-logits.sh
embedding-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
embedding-quantize-Q8_0: embedding-quantize-model
embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-Q4_0: embedding-quantize-model
embedding-quantize-model:
@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
embedding-run-quantized-model:
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
###
### Perplexity targets/recipes
###
perplexity-data-gen:
CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/utils/perplexity-gen.sh
perplexity-run-full:
	QUANTIZED_MODEL="$(QUANTIZED_MODEL)" LOGITS_FILE="$(LOGITS_FILE)" \
./scripts/utils/perplexity-run.sh
perplexity-run:
QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/utils/perplexity-run-simple.sh
###
### HuggingFace targets/recipes
###
hf-create-model:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"
hf-create-model-private:
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p
hf-upload-gguf-to-model:
@./scripts/utils/hf-upload-gguf-model.py -m "${MODEL_PATH}" -r "${REPO_ID}" -o "${NAME_IN_REPO}"
hf-create-collection:
@./scripts/utils/hf-create-collection.py -n "${NAME}" -d "${DESCRIPTION}" -ns "${NAMESPACE}"
hf-add-model-to-collection:
@./scripts/utils/hf-add-model-to-collection.py -c "${COLLECTION}" -m "${MODEL}"
.PHONY: clean
clean:
@${RM} -rf data .converted_embedding_model.txt .converted_model.txt .embedding_model_name.txt .model_name.txt

@@ -0,0 +1,335 @@
# Model Conversion Example
This directory contains scripts and code to help in the process of converting
HuggingFace PyTorch models to GGUF format.
The motivation for having this is that the conversion process can often be
iterative: the original model is inspected, converted, updates are made to
llama.cpp, the model is converted again, and so on. Once the model has been
converted it needs to be verified against the original model, then optionally
quantized, and in some cases the perplexity of the quantized model checked.
Finally, the model (or models) need to be uploaded to the ggml-org
organization on Hugging Face. This tool/example tries to help with this
process.
### Overview
The idea is that the makefile targets and scripts here can be used in the
development/conversion process, assisting with things like (a typical sequence
is sketched after this list):
* inspect/run the original model to figure out how it works
* convert the original model to GGUF format
* inspect/run the converted model
* verify the logits produced by the original model and the converted model
* quantize the model to GGUF format
* run perplexity evaluation to verify that the quantized model is performing
as expected
* upload the model to HuggingFace to make it available for others
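For a causal model, a typical session using these targets might look something
like the following sketch (all paths are placeholders, and embedding models use
the corresponding `embedding-*` targets):
```console
(venv) $ export MODEL_PATH=~/work/ai/models/some_model
(venv) $ make causal-convert-model
(venv) $ export CONVERTED_MODEL=../../models/some_model.gguf
(venv) $ make causal-verify-logits
(venv) $ make causal-quantize-Q8_0
(venv) $ export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
(venv) $ make causal-run-quantized-model
```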
## Setup
Create a virtual Python environment:
```console
$ python3.11 -m venv venv
$ source venv/bin/activate
(venv) $ pip install -r requirements.txt
```
## Causal Language Model Conversion
This section describes the steps to convert a causal language model to GGUF and
to verify that the conversion was successful.
### Download the original model
First, clone the original model to some local directory:
```console
$ mkdir models && cd models
$ git clone https://huggingface.co/user/model_name
$ cd model_name
$ git lfs install
$ git lfs pull
```
### Set the MODEL_PATH
The path to the downloaded model can be provided in two ways:
**Option 1: Environment variable (recommended for iterative development)**
```console
export MODEL_PATH=~/work/ai/models/some_model
```
**Option 2: Command line argument (for one-off tasks)**
```console
make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
```
Command line arguments take precedence over environment variables when both are provided.
In cases where the transformers implementation for the model has not been
released yet, it is possible to set the environment variable
`UNRELEASED_MODEL_NAME`, which will cause the model class to be imported
explicitly instead of using `AutoModelForCausalLM`:
```console
export UNRELEASED_MODEL_NAME=SomeNewModel
```
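The variable is read by the scripts that load the original model (for example
by the `causal-run-original-model` target), so it can simply be exported before
running the usual targets, for example (model name and path are placeholders):
```console
(venv) $ export UNRELEASED_MODEL_NAME=SomeNewModel
(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
```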
### Inspecting the original tensors
```console
# Using environment variable
(venv) $ make causal-inspect-original-model
# Or using command line argument
(venv) $ make causal-inspect-original-model MODEL_PATH=~/work/ai/models/some_model
```
### Running the original model
This is mainly to verify that the original model works, and to compare the output
from the converted model.
```console
# Using environment variable
(venv) $ make causal-run-original-model
# Or using command line argument
(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
```
This command will save two files to the `data` directory: a binary file
containing logits, which will be used for comparison with the converted model
later, and a text file which allows for manual visual inspection.
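For example, with `MODEL_PATH=~/work/ai/models/some_model` the generated files
would be named along the following lines (the exact names depend on the model
directory name):
```console
(venv) $ ls data
pytorch-some_model.bin  pytorch-some_model.txt
```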
### Model conversion
After updates have been made to [gguf-py](../../gguf-py) to add support for the
new model, the model can be converted to GGUF format using the following command:
```console
# Using environment variable
(venv) $ make causal-convert-model
# Or using command line argument
(venv) $ make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
```
### Inspecting the converted model
The converted model can be inspected using the following command:
```console
(venv) $ make causal-inspect-converted-model
```
### Running the converted model
```console
(venv) $ make causal-run-converted-model
```
### Model logits verification
The following target will run the original model and the converted model and
compare the logits:
```console
(venv) $ make causal-verify-logits
```
### Quantizing the model
The causal model can be quantized to GGUF format using the following command:
```console
(venv) $ make causal-quantize-Q8_0
Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
Export the quantized model path to QUANTIZED_MODEL variable in your environment
```
This will show the path to the quantized model in the terminal, which can then
be used to set the `QUANTIZED_MODEL` environment variable:
```console
export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
```
The quantized model can then be run using the following command:
```console
(venv) $ make causal-run-quantized-model
```
## Embedding Language Model Conversion
### Download the original model
```console
$ mkdir models && cd models
$ git clone https://huggingface.co/user/model_name
$ cd model_name
$ git lfs install
$ git lfs pull
```
The path to the embedding model can be provided in two ways:
**Option 1: Environment variable (recommended for iterative development)**
```console
export EMBEDDING_MODEL_PATH=~/path/to/embedding_model
```
**Option 2: Command line argument (for one-off tasks)**
```console
make embedding-convert-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
```
Command line arguments take precedence over environment variables when both are provided.
### Running the original model
This is mainly to verify that the original model works and to compare the output
with the output from the converted model.
```console
# Using environment variable
(venv) $ make embedding-run-original-model
# Or using command line argument
(venv) $ make embedding-run-original-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
```
This command will save two files to the `data` directory: a binary file
containing logits, which will be used for comparison with the converted model,
and a text file which allows for manual visual inspection.
### Model conversion
After updates have been made to [gguf-py](../../gguf-py) to add support for the
new model, the model can be converted to GGUF format using the following command:
```console
(venv) $ make embedding-convert-model
```
### Run the converted model
```console
(venv) $ make embedding-run-converted-model
```
### Model logits verification
The following target will run the original model and the converted model (which
was done manually in the previous steps) and compare the logits:
```console
(venv) $ make embedding-verify-logits
```
### llama-server verification
To verify that the converted model works with llama-server, the following
command can be used:
```console
(venv) $ make embedding-start-embedding-server
```
Then open another terminal and set the `EMBEDDING_MODEL_PATH` environment
variable, as it will not be inherited by the new terminal:
```console
(venv) $ make embedding-curl-embedding-endpoint
```
This will call the `embedding` endpoint and the output will be piped into
the same verification script as used by the `embedding-verify-logits` target.
The causal model can also be used to produce embeddings and this can be verified
using the following commands:
```console
(venv) $ make causal-start-embedding-server
```
Then open another terminal and set the `MODEL_PATH` environment variable, as
it will not be inherited by the new terminal:
```console
(venv) $ make causal-curl-embedding-endpoint
```
### Quantizing the model
The embedding model can be quantized to GGUF format using the following command:
```console
(venv) $ make embedding-quantize-Q8_0
Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
```
This will show the path to the quantized model in the terminal, which can then
be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
```console
export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
```
The quantized model can then be run using the following command:
```console
(venv) $ make embedding-run-quantized-model
```
## Perplexity Evaluation
### Simple perplexity evaluation
This allows running the perplexity evaluation without having to generate a
token/logits file first:
```console
(venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
```
This will use the wikitext dataset to run the perplexity evaluation and
output the perplexity score to the terminal. This value can then be compared
with the perplexity score of the unquantized model.
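Under the hood this target invokes `scripts/utils/perplexity-run-simple.sh`.
As a rough sketch, the kind of command that script is expected to run is
something like the following (the exact dataset handling lives in the script):
```console
$ ../../build/bin/llama-perplexity -m ~/path/to/quantized/model.gguf -f wikitext-2-raw/wiki.test.raw
```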
### Full perplexity evaluation
First, use the converted (non-quantized) model to generate the perplexity
evaluation dataset using the following command:
```console
$ make perplexity-data-gen CONVERTED_MODEL=~/path/to/converted/model.gguf
```
This will generate a file in the `data` directory named after the model and with
a `.kld` suffix which contains the tokens and the logits for the wikitext dataset.
After the dataset has been generated, the perplexity evaluation can be run using
the quantized model:
```console
$ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LOGITS_FILE=data/model.gguf.ppl
```
> 📝 **Note:** The `LOGITS_FILE` generated by the previous command can be very
> large, so make sure you have enough disk space available.
## HuggingFace utilities
The following targets are useful for creating collections and model repositories
on Hugging Face in the ggml-org organization. These can be used to script the
process when preparing new model releases.
For the following targets a `HF_TOKEN` environment variable is required.
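For example (the token value shown is a placeholder):
```console
(venv) $ export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
```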
> 📝 **Note:** Don't forget to logout from Hugging Face after running these
> commands, otherwise you might have issues pulling/cloning repositories as
> the token will still be in use:
> $ huggingface-cli logout
> $ unset HF_TOKEN
### Create a new Hugging Face Model (model repository)
This will create a new model repository on Hugging Face with the specified
model name.
```console
(venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev"
Repository ID: danbev/TestModel-GGUF
Repository created: https://huggingface.co/danbev/TestModel-GGUF
```
Note that we append a `-GGUF` suffix to the model name to ensure a consistent
naming convention for GGUF models.
### Upload a GGUF model to model repository
The following target uploads a model to an existing Hugging Face model repository.
```console
(venv) $ make hf-upload-gguf-to-model MODEL_PATH=dummy-model1.gguf REPO_ID=danbev/TestModel-GGUF
📤 Uploading dummy-model1.gguf to danbev/TestModel-GGUF/dummy-model1.gguf
✅ Upload successful!
🔗 File available at: https://huggingface.co/danbev/TestModel-GGUF/blob/main/dummy-model1.gguf
```
This command can also be used to update an existing model file in a repository.
### Create a new Collection
```console
(venv) $ make hf-create-collection NAME=TestCollection DESCRIPTION="Collection for testing scripts" NAMESPACE=danbev
🚀 Creating Hugging Face Collection
Title: TestCollection
Description: Collection for testing scripts
Namespace: danbev
Private: False
✅ Authenticated as: danbev
📚 Creating collection: 'TestCollection'...
✅ Collection created successfully!
📋 Collection slug: danbev/testcollection-68930fcf73eb3fc200b9956d
🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
🎉 Collection created successfully!
Use this slug to add models: danbev/testcollection-68930fcf73eb3fc200b9956d
```
### Add model to a Collection
```console
(venv) $ make hf-add-model-to-collection COLLECTION=danbev/testcollection-68930fcf73eb3fc200b9956d MODEL=danbev/TestModel-GGUF
✅ Authenticated as: danbev
🔍 Checking if model exists: danbev/TestModel-GGUF
✅ Model found: danbev/TestModel-GGUF
📚 Adding model to collection...
✅ Model added to collection successfully!
🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
🎉 Model added successfully!
```

@@ -0,0 +1,209 @@
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <ctype.h>
#include <filesystem>
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [prompt]\n", argv[0]);
printf("\n");
}
int main(int argc, char ** argv) {
std::string model_path;
std::string prompt = "Hello, my name is";
int ngl = 0;
bool embedding_mode = false;
{
int i = 1;
for (; i < argc; i++) {
if (strcmp(argv[i], "-m") == 0) {
if (i + 1 < argc) {
model_path = argv[++i];
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-ngl") == 0) {
if (i + 1 < argc) {
try {
ngl = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-embd-mode") == 0) {
if (i + 1 < argc) {
try {
embedding_mode = true;
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else {
// prompt starts here
break;
}
}
if (model_path.empty()) {
print_usage(argc, argv);
return 1;
}
if (i < argc) {
prompt = argv[i++];
for (; i < argc; i++) {
prompt += " ";
prompt += argv[i];
}
}
}
ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
// Extract basename from model_path
const char * basename = strrchr(model_path.c_str(), '/');
basename = (basename == NULL) ? model_path.c_str() : basename + 1;
char model_name[256];
strncpy(model_name, basename, 255);
model_name[255] = '\0';
char * dot = strrchr(model_name, '.');
if (dot != NULL && strcmp(dot, ".gguf") == 0) {
*dot = '\0';
}
printf("Model name: %s\n", model_name);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_prompt;
ctx_params.n_batch = n_prompt;
ctx_params.no_perf = false;
if (embedding_mode) {
ctx_params.embeddings = true;
ctx_params.n_ubatch = ctx_params.n_batch;
}
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
printf("Input prompt: \"%s\"\n", prompt.c_str());
printf("Tokenized prompt (%d tokens): ", n_prompt);
for (auto id : prompt_tokens) {
char buf[128];
int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
}
printf("\n");
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
float * logits;
int n_logits;
const char * type;
if (embedding_mode) {
logits = llama_get_embeddings(ctx);
n_logits = llama_model_n_embd(model) * batch.n_tokens;
type = "-embeddings";
printf("Embeddings size: %d\n", n_logits);
} else {
logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
n_logits = llama_vocab_n_tokens(vocab);
type = "";
printf("Vocab size: %d\n", n_logits);
}
std::filesystem::create_directory("data");
// Save logits to binary file
char bin_filename[512];
snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
printf("Saving logits to %s\n", bin_filename);
FILE * f = fopen(bin_filename, "wb");
if (f == NULL) {
fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
return 1;
}
fwrite(logits, sizeof(float), n_logits, f);
fclose(f);
// Also save as text for debugging
char txt_filename[512];
snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type);
f = fopen(txt_filename, "w");
if (f == NULL) {
fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
return 1;
}
for (int i = 0; i < n_logits; i++) {
fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format
}
fclose(f);
// Print first and last 10 logits for quick verification
printf("First 10 logits: ");
for (int i = 0; i < 10 && i < n_logits; i++) {
printf("%.6f ", logits[i]);
}
printf("\n");
printf("Last 10 logits: ");
for (int i = n_logits - 10; i < n_logits; i++) {
if (i >= 0) printf("%.6f ", logits[i]);
}
printf("\n\n");
printf("Logits saved to %s\n", bin_filename);
printf("Logits saved to %s\n", txt_filename);
llama_free(ctx);
llama_model_free(model);
return 0;
}

@@ -0,0 +1,4 @@
torch~=2.6.0
torchvision~=0.21.0
transformers~=4.55.0
huggingface-hub~=0.34.0

@@ -0,0 +1,43 @@
#!/bin/bash
set -e
MODEL_PATH="${1:-"$MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
else
# Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
python3 -c "
import json
import sys
import struct
data = json.load(sys.stdin)
# Flatten all embeddings completely
flattened = []
for item in data:
embedding = item['embedding']
for token_embedding in embedding:
flattened.extend(token_embedding)
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
# Write as binary floats - matches logits.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
for value in flattened:
f.write(struct.pack('f', value))
"
CPP_EMBEDDINGS="$TEMP_FILE"
trap "rm -f $TEMP_FILE" EXIT
fi
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
--cpp-embeddings $CPP_EMBEDDINGS \
--prompt "Hello world today" \
--causal

@@ -0,0 +1,88 @@
#!/usr/bin/env python3
import numpy as np
import sys
import os
from pathlib import Path
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""
try:
pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32)
llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32)
except Exception as e:
print(f"❌ NOK: Failed to load files - {e}")
return False
# Check shapes match
if pytorch_logits.shape != llamacpp_logits.shape:
print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}")
return False
# Calculate key metrics
diff = pytorch_logits - llamacpp_logits
abs_diff = np.abs(diff)
max_diff = np.max(abs_diff)
# Get top 10 predictions from both models
pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1]
llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1]
print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}")
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
print(f"Max absolute difference: {max_diff:.4f}")
if max_diff > 1.0:
print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
return False
return True
def main():
model_path = os.getenv('MODEL_PATH')
if not model_path:
print("Error: MODEL_PATH environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
model_name = os.path.splitext(os.path.basename(model_path))[0]
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
if not pytorch_file.exists():
print(f"Error: PyTorch logits file not found: {pytorch_file}")
print("Please run scripts/run-org-model.sh first to generate this file.")
sys.exit(1)
if not llamacpp_file.exists():
print(f"Error: llama.cpp logits file not found: {llamacpp_file}")
print("Please run scripts/run-converted-model.sh first to generate this file.")
sys.exit(1)
print("Checked all required files were found. Proceeding...\n")
print("🔍 GGML Model Validation for model ", model_name)
print("=" * 40)
print(f"PyTorch logits : {pytorch_file}")
print(f"llama.cpp logits: {llamacpp_file}")
print()
success = quick_logits_check(pytorch_file, llamacpp_file)
# Exit with appropriate code
if success:
print("✅ OK: Lightweight model check successful!")
print(" Ok to proceed with NMSE check...")
sys.exit(0)
else:
print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
sys.exit(1)
if __name__ == "__main__":
main()

@@ -0,0 +1,22 @@
#!/bin/bash
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
echo "Metadata override: ${METADATA_OVERRIDE}"
python ../../convert_hf_to_gguf.py --verbose \
${MODEL_PATH} \
--outfile ${CONVERTED_MODEL} \
--outtype ${TYPE} \
--metadata "${METADATA_OVERRIDE}"
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
import argparse
import os
import importlib
import sys
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM
from pathlib import Path
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
config = AutoConfig.from_pretrained(model_path)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path)
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")
model_name = os.path.basename(model_path)
print(f"Model name: {model_name}")
prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids, output_hidden_states=True)
# Extract hidden states from the last layer
# outputs.hidden_states is a tuple of (num_layers + 1) tensors
# Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
last_hidden_states = outputs.hidden_states[-1]
# Get embeddings for all tokens
token_embeddings = last_hidden_states[0].cpu().numpy() # Remove batch dimension
print(f"Hidden states shape: {last_hidden_states.shape}")
print(f"Token embeddings shape: {token_embeddings.shape}")
print(f"Hidden dimension: {token_embeddings.shape[-1]}")
print(f"Number of tokens: {token_embeddings.shape[0]}")
# Save raw token embeddings
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
# Save all token embeddings as binary
print(token_embeddings)
token_embeddings.astype(np.float32).tofile(bin_filename)
# Save as text for inspection
with open(txt_filename, "w") as f:
for i, embedding in enumerate(token_embeddings):
for j, val in enumerate(embedding):
f.write(f"{i} {j} {val:.6f}\n")
# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
for i, embedding in enumerate(token_embeddings):
# Format: show first few values, ..., then last few values
if len(embedding) > 10:
# Show first 3 and last 3 values with ... in between
first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
print(f"embedding {i}: {first_vals} ... {last_vals}")
else:
# If embedding is short, show all values
vals = " ".join(f"{val:8.6f}" for val in embedding)
print(f"embedding {i}: {vals}")
# Also show token info for reference
print(f"\nToken reference:")
for i, token in enumerate(tokens):
print(f" Token {i}: {repr(token)}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")

@@ -0,0 +1,18 @@
#!/bin/bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"

@@ -0,0 +1,20 @@
#!/bin/bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"

@@ -0,0 +1,100 @@
#!/usr/bin/env python3
import argparse
import os
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
config = AutoConfig.from_pretrained(model_path)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
model_name = os.path.basename(model_path)
# Printing the Model class to allow for easier debugging. This can be useful
# when working with models that have not been publicly released yet and this
# might require that the concrete class is imported and used directly instead
# of using AutoModelForCausalLM.
print(f"Model class: {model.__class__.__name__}")
prompt = "Hello, my name is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids)
logits = outputs.logits
# Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].cpu().numpy()
print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}")
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
# Save to file for comparison
last_logits.astype(np.float32).tofile(bin_filename)
# Also save as text file for easy inspection
with open(txt_filename, "w") as f:
for i, logit in enumerate(last_logits):
f.write(f"{i}: {logit:.6f}\n")
# Print some sample logits for quick verification
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
# Show top 5 predicted tokens
top_indices = np.argsort(last_logits)[-5:][::-1]
print("Top 5 predictions:")
for idx in top_indices:
token = tokenizer.decode([idx])
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")

@@ -0,0 +1,42 @@
#!/bin/bash
set -e
MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
else
# Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
python3 -c "
import json
import sys
import struct
data = json.load(sys.stdin)
# Flatten all embeddings completely
flattened = []
for item in data:
embedding = item['embedding']
for token_embedding in embedding:
flattened.extend(token_embedding)
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
# Write as binary floats - matches logits.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
for value in flattened:
f.write(struct.pack('f', value))
"
CPP_EMBEDDINGS="$TEMP_FILE"
trap "rm -f $TEMP_FILE" EXIT
fi
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
--cpp-embeddings $CPP_EMBEDDINGS \
--prompt "Hello world today"

@@ -0,0 +1,22 @@
#!/bin/bash
set -e
MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${EMBEDDING_MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
python ../../convert_hf_to_gguf.py --verbose \
${EMBEDDING_MODEL_PATH} \
--outfile ${CONVERTED_MODEL} \
--outtype ${TYPE}
echo ""
echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"
echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})"

@@ -0,0 +1,20 @@
#!/bin/bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_EMBEDDING_MODEL environment variable" >&2
exit 1
fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
import argparse
import os
import numpy as np
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
tokenizer = AutoTokenizer.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}Model"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModel.from_pretrained(model_path)
print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")
config = AutoConfig.from_pretrained(model_path)
model_name = os.path.basename(model_path)
texts = [ "Hello world today" ]
encoded = tokenizer(
texts,
padding=True,
truncation=True,
return_tensors="pt"
)
tokens = encoded['input_ids'][0]
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
with torch.no_grad():
outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
# Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
print(f"Hidden states shape: {hidden_states.shape}")
print(f"All embeddings shape: {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1]}")
# Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0]
print() # Empty line to match C++ output
for j in range(n_embd_count):
embedding = all_embeddings[j]
print(f"embedding {j}: ", end="")
# Print first 3 values
for i in range(min(3, n_embd)):
print(f"{embedding[i]:9.6f} ", end="")
print(" ... ", end="")
# Print last 3 values
for i in range(n_embd - 3, n_embd):
print(f"{embedding[i]:9.6f} ", end="")
print() # New line
print() # Final empty line to match C++ output
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
# Save all embeddings flattened (matching what embedding.cpp would save if it did)
flattened_embeddings = all_embeddings.flatten()
flattened_embeddings.astype(np.float32).tofile(bin_filename)
with open(txt_filename, "w") as f:
f.write(f"# Model class: {model_name}\n")
f.write(f"# Tokens: {token_strings}\n")
f.write(f"# Shape: {all_embeddings.shape}\n")
f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
for j in range(n_embd_count):
f.write(f"# Token {j} ({token_strings[j]}):\n")
for i, value in enumerate(all_embeddings[j]):
f.write(f"{j}_{i}: {value:.6f}\n")
f.write("\n")
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
print("")
print(f"Saved bin embeddings to: {bin_filename}")
print(f"Saved txt embeddings to: {txt_filename}")

@@ -0,0 +1,13 @@
---
base_model:
- {base_model}
---
# {model_name} GGUF
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
```
Then, access http://localhost:8080

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
import numpy as np
import sys
import os
import argparse
from pathlib import Path
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)
ref_var = np.var(reference)
    if ref_var == 0:
        nmse = float('inf') if mse > 0 else 0.0
        return nmse, mse, ref_var
nmse = mse / ref_var
return nmse, mse, ref_var
def load_logits(file_path):
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
if file_path.suffix == '.npy':
return np.load(file_path)
elif file_path.suffix == '.bin':
return np.fromfile(file_path, dtype=np.float32)
else:
# Try to load as text file
try:
# If it has index format "0: value", extract just values
data = []
with open(file_path, 'r') as f:
for line in f:
if ':' in line:
# Format: "index: value"
value = float(line.split(':')[1].strip())
else:
# Just the value
value = float(line.strip())
data.append(value)
return np.array(data, dtype=np.float32)
except:
return np.loadtxt(file_path, dtype=np.float32)
def interpret_nmse(nmse):
"""Provide interpretation of NMSE value"""
if nmse == 0:
return "Perfect match", "🎉"
elif nmse < 1e-6:
return "Essentially identical", ""
elif nmse < 1e-4:
return "Excellent match", ""
elif nmse < 1e-3:
return "Very good match", "👍"
elif nmse < 1e-2:
return "Good match", "👍"
elif nmse < 0.1:
return "Acceptable match", "⚠️"
elif nmse < 1.0:
return "Poor match", ""
else:
return "Very poor match (worse than noise)", ""
def main():
parser = argparse.ArgumentParser(description='Validate model logits')
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
args = parser.parse_args()
model_name = os.path.splitext(os.path.basename(args.model_path))[0]
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
print(f"Model name: {model_name}")
print(f"PyTorch logits file: {pytorch_file}")
print(f"llama.cpp logits file: {llamacpp_file}")
reference_file = pytorch_file
test_file = llamacpp_file
print("📊 NMSE Check for Model Comparison")
print("=" * 50)
print(f"Reference (ground truth): {reference_file}")
print(f"Test (to evaluate): {test_file}")
print()
try:
print("Loading reference logits...")
reference = load_logits(reference_file)
print(f" Shape: {reference.shape}, Type: {reference.dtype}")
print("Loading test logits...")
test = load_logits(test_file)
print(f" Shape: {test.shape}, Type: {test.dtype}")
# Check shapes match
if reference.shape != test.shape:
print(f"\n❌ Error: Shape mismatch!")
print(f" Reference: {reference.shape}")
print(f" Test: {test.shape}")
sys.exit(1)
print(f"\n✅ Shapes match: {reference.shape}")
nmse, mse, ref_var = calculate_nmse(reference, test)
# Additional metrics
max_abs_error = np.max(np.abs(test - reference))
mean_abs_error = np.mean(np.abs(test - reference))
# Results
print(f"\n📈 METRICS")
print("=" * 30)
print(f"MSE (Mean Squared Error): {mse:.6e}")
print(f"Reference Variance: {ref_var:.6e}")
print(f"NMSE: {nmse:.6e}")
print(f"Max Absolute Error: {max_abs_error:.6f}")
print(f"Mean Absolute Error: {mean_abs_error:.6f}")
# NMSE in dB (common in signal processing)
if nmse > 0:
nmse_db = 10 * np.log10(nmse)
print(f"NMSE (dB): {nmse_db:.2f} dB")
# Interpretation
interpretation, emoji = interpret_nmse(nmse)
print(f"\n🎯 INTERPRETATION")
print("=" * 30)
print(f"{emoji} {interpretation}")
# Detailed guidance
print(f"\n📋 GUIDANCE")
print("=" * 30)
if nmse < 1e-3:
print("✅ EXCELLENT: Your GGML conversion is working very well!")
print(" The differences are negligible for practical use.")
elif nmse < 1e-2:
print("👍 GOOD: Your GGML conversion is working well.")
print(" Small differences are likely due to precision/quantization.")
elif nmse < 0.1:
print("⚠️ ACCEPTABLE: Conversion is working but with some differences.")
print(" Check if you're using quantization (Q4, Q8, etc.)")
print(" Test generation quality to see if it's acceptable.")
else:
print("❌ PROBLEMATIC: Large differences detected.")
print(" Check your conversion process for potential issues.")
print(" Verify you're using the same model weights.")
# NMSE benchmarks
print(f"\n📚 NMSE BENCHMARKS")
print("=" * 30)
print("< 1e-6: Essentially identical")
print("< 1e-4: Excellent (typical for good conversions)")
print("< 1e-3: Very good")
print("< 1e-2: Good (acceptable for most use cases)")
print("< 0.1: Acceptable (may need verification)")
print("> 1.0: Poor (worse than random)")
# Exit code based on NMSE
if nmse < 1e-2:
print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
sys.exit(0)
else:
print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
sys.exit(1)
except Exception as e:
print(f"❌ Error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()

@@ -0,0 +1,6 @@
COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
echo "Created collection: $COLLECTION_SLUG"
# Use it in the next command
python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import sys
def add_model_to_collection(collection_slug, model_id, note=""):
"""
Add a model to an existing collection
Args:
collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
model_id: The model repository ID (e.g., "username/model-name")
note: Optional note about the model
Returns:
True if successful, False if failed
"""
# Initialize API
api = HfApi()
try:
user_info = api.whoami()
print(f"✅ Authenticated as: {user_info['name']}")
# Verify the model exists
print(f"🔍 Checking if model exists: {model_id}")
try:
model_info = api.model_info(model_id)
except Exception as e:
print(f"❌ Model not found or not accessible: {model_id}")
print(f"Error: {e}")
return False
print(f"📚 Adding model to collection...")
api.add_collection_item(
collection_slug=collection_slug,
item_id=model_id,
item_type="model",
note=note
)
print(f"✅ Model added to collection successfully!")
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
return True
except Exception as e:
print(f"❌ Error adding model to collection: {e}")
return False
def main():
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
parser.add_argument('--note', '-n', help='An optional note/description', required=False)
args = parser.parse_args()
collection = args.collection
model = args.model
note = args.note
success = add_model_to_collection(
collection_slug=collection,
model_id=model,
note=note
)
if success:
print("\n🎉 Model added successfully!")
else:
print("\n❌ Failed to add model to collection")
sys.exit(1)
if __name__ == "__main__":
main()

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import os
import sys
def create_collection(title, description, private=False, namespace=None, return_slug=False):
"""
Create a new collection on Hugging Face
Args:
title: Collection title
description: Collection description
private: Whether the collection should be private (default: False)
namespace: Optional namespace (defaults to your username)
Returns:
Collection object if successful, None if failed
"""
# Check if HF_TOKEN is available
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
if not token:
print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
print("Please set your Hugging Face token as an environment variable")
return None
# Initialize API
api = HfApi()
try:
# Test authentication first
user_info = api.whoami()
if not return_slug:
print(f"✅ Authenticated as: {user_info['name']}")
# Create the collection
if not return_slug:
print(f"📚 Creating collection: '{title}'...")
collection = api.create_collection(
title=title,
description=description,
private=private,
namespace=namespace
)
if not return_slug:
print(f"✅ Collection created successfully!")
print(f"📋 Collection slug: {collection.slug}")
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")
return collection
except Exception as e:
print(f"❌ Error creating collection: {e}")
return None
def main():
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
    parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true')
    parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true')
args = parser.parse_args()
name = args.name
description = args.description
private = args.private
namespace = args.namespace
return_slug = args.return_slug
if not return_slug:
print("🚀 Creating Hugging Face Collection")
print(f"Title: {name}")
print(f"Description: {description}")
print(f"Namespace: {namespace}")
print(f"Private: {private}")
collection = create_collection(
title=name,
description=description,
private=private,
namespace=namespace,
return_slug=return_slug
)
if collection:
if return_slug:
print(collection.slug)
else:
print("\n🎉 Collection created successfully!")
print(f"Use this slug to add models: {collection.slug}")
else:
print("\n❌ Failed to create collection")
sys.exit(1)
if __name__ == "__main__":
main()

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
def load_template_and_substitute(template_path, **kwargs):
try:
with open(template_path, 'r', encoding='utf-8') as f:
template_content = f.read()
return template_content.format(**kwargs)
except FileNotFoundError:
print(f"Template file '{template_path}' not found!")
return None
except KeyError as e:
print(f"Missing template variable: {e}")
return None
parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
parser.add_argument('--private', '-p', action='store_true', help='Create private model')
args = parser.parse_args()
repo_id = f"{args.namespace}/{args.model_name}-GGUF"
print("Repository ID: ", repo_id)
repo_url = api.create_repo(
repo_id=repo_id,
repo_type="model",
private=args.private,
exist_ok=False
)
if not args.no_card:
template_path = "scripts/readme.md.template"
model_card_content = load_template_and_substitute(
template_path,
model_name=args.model_name,
namespace=args.namespace,
base_model=args.org_base_model,
)
if model_card_content:
api.upload_file(
path_or_fileobj=model_card_content.encode('utf-8'),
path_in_repo="README.md",
repo_id=repo_id
)
print("Model card created successfully.")
else:
print("Failed to create model card.")
print(f"Repository created: {repo_url}")

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import os
def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
"""
Upload a GGUF file to a Hugging Face model repository
Args:
local_file_path: Path to your local GGUF file
repo_id: Your repository ID (e.g., "username/model-name")
filename_in_repo: Optional custom name for the file in the repo
"""
if not os.path.exists(local_file_path):
print(f"❌ File not found: {local_file_path}")
return False
    if not filename_in_repo:
        filename_in_repo = os.path.basename(local_file_path)
print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}")
api = HfApi()
try:
api.upload_file(
path_or_fileobj=local_file_path,
path_in_repo=filename_in_repo,
repo_id=repo_id,
repo_type="model",
commit_message=f"Upload {filename_in_repo}"
)
print("✅ Upload successful!")
print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
return True
except Exception as e:
print(f"❌ Upload failed: {e}")
return False
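# Example invocation (script name and paths are illustrative; HF_TOKEN must be set):
#   python hf-upload-gguf-to-model.py -m my-model-Q8_0.gguf -r myorg/my-model-GGUF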
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
args = parser.parse_args()
upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)

View file

@ -0,0 +1,14 @@
#!/bin/bash
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
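# gguf_dump.py prints the GGUF header, the metadata key/value pairs and the
# tensor overview of the converted model.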
../../gguf-py/gguf/scripts/gguf_dump.py $CONVERTED_MODEL

View file

@ -0,0 +1,67 @@
#!/usr/bin/env python3
import argparse
import os
import json
from safetensors import safe_open
from collections import defaultdict
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
# Check if there's an index file (multi-file model)
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")
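# Sharded checkpoints ship a model.safetensors.index.json whose "weight_map"
# maps each tensor name to the shard that stores it, e.g.
# {"weight_map": {"model.embed_tokens.weight": "model-00001-of-00002.safetensors", ...}}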
if os.path.exists(index_path):
# Multi-file model
print("Multi-file model detected")
with open(index_path, 'r') as f:
index_data = json.load(f)
# Get the weight map (tensor_name -> file_name)
weight_map = index_data.get("weight_map", {})
# Group tensors by file for efficient processing
file_tensors = defaultdict(list)
for tensor_name, file_name in weight_map.items():
file_tensors[file_name].append(tensor_name)
print("Tensors in model:")
# Process each shard file
for file_name, tensor_names in file_tensors.items():
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
with safe_open(file_path, framework="pt") as f:
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
with safe_open(single_file_path, framework="pt") as f:
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):
tensor = f.get_tensor(key)
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
else:
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
print("Available files:")
if os.path.exists(model_path):
for item in sorted(os.listdir(model_path)):
print(f" {item}")
else:
print(f" Directory {model_path} does not exist")
exit(1)

View file

@ -0,0 +1,35 @@
#!/bin/bash
set -e
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
# Check if ppl/wikitext-2-raw directory exists
if [ ! -d "ppl/wikitext-2-raw" ]; then
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
mkdir -p ppl
pushd ppl
./../../../scripts/get-wikitext-2.sh
popd
fi
mkdir -p ppl
OUTPUTFILE="ppl/$(basename $CONVERTED_MODEL).kld"
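# llama-perplexity stores the base (converted) model's logits in this file so
# that quantized runs can later be compared against it with --kl-divergence
# (see the KL-divergence script further below).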
echo "Model: $CONVERTED_MODEL"
cmake --build ../../build --target llama-perplexity -j8
../../build/bin/llama-perplexity -m $CONVERTED_MODEL \
-f ppl/wikitext-2-raw/wiki.test.raw \
--kl-divergence-base $OUTPUTFILE
echo "Generated logits in $OUTPUTFILE"

View file

@ -0,0 +1,27 @@
#!/bin/bash
set -e
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
if [ -z "$QUANTIZED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. QUANTIZED_MODEL environment variable" >&2
exit 1
fi
# Check if ppl/wikitext-2-raw directory exists
if [ ! -d "ppl/wikitext-2-raw" ]; then
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
mkdir -p ppl
pushd ppl
./../../../scripts/get-wikitext-2.sh
popd
fi
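# Reports the perplexity of the quantized model over the wikitext-2 test set;
# comparing it with a run of the unquantized model gives a rough quality check.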
cmake --build ../../build --target llama-perplexity -j8
../../build/bin/llama-perplexity -m $QUANTIZED_MODEL -f ppl/wikitext-2-raw/wiki.test.raw

View file

@ -0,0 +1,28 @@
#!/bin/bash
set -e
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
LOGITS_FILE="${2:-"$LOGITS_FILE"}"
if [ -z "$QUANTIZED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. QUANTIZED_MODEL environment variable" >&2
exit 1
fi
if [ ! -f "${LOGITS_FILE}" ]; then
    echo "Error: logits file '${LOGITS_FILE}' was not found" >&2
    echo "Did you run the perplexity-gen.sh script?" >&2
    exit 1
fi
echo "Model: $QUANTIZED_MODEL"
echo "Data file: $LOGITS_FILE"
cmake --build ../../build --target llama-perplexity -j8
../../build/bin/llama-perplexity -m $QUANTIZED_MODEL \
--kl-divergence-base $LOGITS_FILE \
--kl-divergence

View file

@ -0,0 +1,34 @@
#!/bin/bash
set -e
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
QUANTIZED_MODEL=$CONVERTED_MODEL
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
echo "Converted model: $CONVERTED_MODEL"
# Process the quantized model filename
if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
# Remove .gguf suffix, add quantized type, then add .gguf back
BASE_NAME="${QUANTIZED_MODEL%.gguf}"
QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
else
echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
exit 1
fi
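# Example: CONVERTED_MODEL=models/foo.gguf QUANTIZED_TYPE=Q8_0 produces models/foo-Q8_0.gguf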
cmake --build ../../build --target llama-quantize -j8
../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
echo "Quantized model saved to: $QUANTIZED_MODEL"

View file

@ -0,0 +1,22 @@
#!/bin/bash
set -e
#
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
echo "Converted model: $CONVERTED_MODEL"
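# --pooling none makes the server return one embedding per input token instead
# of a single pooled vector.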
cmake --build ../../build --target llama-server
../../build/bin/llama-server -m $CONVERTED_MODEL \
--embedding \
--pooling none

View file

@ -0,0 +1,179 @@
#!/usr/bin/env python3
import numpy as np
import argparse
import os
import importlib
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
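# Pairwise cosine similarity: cos(a, b) = (a . b) / (||a|| * ||b||). Zero norms
# are clamped to 1e-8 to avoid division by zero; the result is an (n_a, n_b)
# matrix when both inputs are 2D.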
def cosine_similarity(a, b=None):
a = np.asarray(a)
if b is None:
b = a
else:
b = np.asarray(b)
if a.ndim == 1:
a = a.reshape(1, -1)
if b.ndim == 1:
b = b.reshape(1, -1)
a_norms = np.linalg.norm(a, axis=1, keepdims=True)
b_norms = np.linalg.norm(b, axis=1, keepdims=True)
a_norms = np.where(a_norms == 0, 1e-8, a_norms)
b_norms = np.where(b_norms == 0, 1e-8, b_norms)
a_normalized = a / a_norms
b_normalized = b / b_norms
# Compute cosine similarity
return np.dot(a_normalized, b_normalized.T)
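# Both embeddings files are raw float32 dumps laid out row-major as
# (n_tokens, n_embd); this matches what the accompanying generation scripts are
# expected to write.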
def load_embeddings_from_file(filename, n_tokens, n_embd):
embeddings = np.fromfile(filename, dtype=np.float32)
return embeddings.reshape(n_tokens, n_embd)
def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
np.set_printoptions(suppress=True, precision=6)
print("pytorch embeddings:");
print(python_emb)
print("llama.cpp embeddings:");
print(cpp_emb)
print(f"\n=== Prompt: '{prompt}' ===")
print(f"Tokens: {tokens}")
print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
n_tokens = len(tokens)
# 1. Direct embedding comparison
print(f"\n1. Raw Embedding Magnitude Comparison:")
    # Compare the distance of each token embedding from the origin to check
    # whether the vectors lie on the same "sphere". This says nothing about
    # direction (the meaning of the token embedding), only about magnitude.
for i in range(n_tokens):
py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
# 2. Cosine similarity between tokens within each model
    # Here we check the direction of the token embeddings to see if they have
    # the same meaning (similarity). This is done by computing the cosine
    # similarity of each pair of token embeddings within each model.
print(f"\n2. Within-Model Token Similarities:")
print(" Python model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
print(" llama.cpp model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
# 3. Cross-model similarity (same token position)
print(f"\n3. Cross-Model Same-Token Similarities:")
for i in range(n_tokens):
sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
print(f" Token {i} ({tokens[i]}): {sim:.4f}")
# 4. Similarity matrix comparison
print(f"\n4. Similarity Matrix Differences:")
py_sim_matrix = cosine_similarity(python_emb)
cpp_sim_matrix = cosine_similarity(cpp_emb)
diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
print(f" Max difference: {np.max(diff_matrix):.4f}")
print(f" Mean difference: {np.mean(diff_matrix):.4f}")
print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
return {
'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
'similarity_matrix_diff': diff_matrix,
'max_diff': np.max(diff_matrix),
'mean_diff': np.mean(diff_matrix),
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
}
def main():
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file')
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', action='store_true', help='Whether the model is causal (default: False)')
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
args = parser.parse_args()
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
print("=" * 70)
# Single prompt detailed comparison
print(f"\nTesting with prompt: '{args.prompt}'")
# Load the python model to get configuration information and also to load the tokenizer.
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
config = AutoConfig.from_pretrained(args.model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
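        # e.g. UNRELEASED_MODEL_NAME=Foo resolves to
        # transformers.models.foo.modular_foo.FooForCausalLM or FooModel below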
if args.causal:
class_name = f"{unreleased_model_name}ForCausalLM"
else:
class_name = f"{unreleased_model_name}Model"
print(f"Model class: {class_name}")
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(args.model_path)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
if args.causal:
model = AutoModelForCausalLM.from_pretrained(args.model_path)
else:
model = AutoModel.from_pretrained(args.model_path)
encoded = tokenizer(args.prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
n_tokens = len(tokens)
print(f"n_tokens: {n_tokens}");
print(f"hidden_size: {model.config.hidden_size}")
# Load binary embeddings from data directory.
llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
# Run comparison
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
# Summary
print(f"\n=== SUMMARY ===")
avg_cross_sim = np.mean(results['cross_model_similarities'])
print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")
# Quality assessment
if avg_cross_sim > 0.95:
print("✅ EXCELLENT: Models are highly similar")
elif avg_cross_sim > 0.90:
print("✅ VERY GOOD: Models are very similar")
elif avg_cross_sim > 0.80:
print("⚠️ GOOD: Models are reasonably similar")
elif avg_cross_sim > 0.70:
print("⚠️ FAIR: Models have some differences")
else:
print("❌ POOR: Models are significantly different")
if __name__ == "__main__":
main()