Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	ggml-metal.m
Commit 93d3871056

22 changed files with 341 additions and 198 deletions
.github/workflows/server.yml (vendored, 17 changes)
@@ -25,17 +25,14 @@ jobs:
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
-            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
+          - build_type: Debug
+            sanitizer: THREAD
+            disabled_on_pr: true
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
     container:
       image: ubuntu:latest
@@ -81,13 +78,14 @@ jobs:
 
       - name: Tests
        id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
@@ -124,13 +122,14 @@ jobs:
 
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           behave.exe --stop --no-skipped --no-capture --tags slow
.gitignore (vendored, 2 changes)
@@ -13,6 +13,8 @@
 .vs/
 .vscode/
 
+ggml-metal-embed.metal
+
 lcov-report/
 gcovr-report/
 
@@ -1878,3 +1878,16 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+    double sum  = 0.0;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum  += embd1[i] * embd2[i];
+        sum1 += embd1[i] * embd1[i];
+        sum2 += embd2[i] * embd2[i];
+    }
+
+    return sum / (sqrt(sum1) * sqrt(sum2));
+}
@@ -282,3 +282,4 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 
 void llama_embd_normalize(const float * inp, float * out, int n);
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
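The helper declared above operates on raw float buffers, so it can be called directly on the flat embedding arrays that the examples below use. A minimal usage sketch follows; the toy vectors and the assumption that the program is built inside the llama.cpp tree and linked against the common library are illustrative, not part of this commit.

```cpp
// Minimal sketch: compare two embedding vectors with the new helpers from common.h.
// Assumes compilation within the llama.cpp tree, linked against the common library.
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    // two toy 4-dimensional "embeddings" (hypothetical values)
    std::vector<float> a = { 0.1f, 0.2f, 0.3f, 0.4f };
    std::vector<float> b = { 0.4f, 0.3f, 0.2f, 0.1f };

    const int n_embd = (int) a.size();

    // optional: L2-normalize into separate buffers
    std::vector<float> a_norm(n_embd), b_norm(n_embd);
    llama_embd_normalize(a.data(), a_norm.data(), n_embd);
    llama_embd_normalize(b.data(), b_norm.data(), n_embd);

    // cosine similarity is scale-invariant, so both calls print the same value
    printf("cos(a, b)           = %f\n", llama_embd_similarity_cos(a.data(),      b.data(),      n_embd));
    printf("cos(a_norm, b_norm) = %f\n", llama_embd_similarity_cos(a_norm.data(), b_norm.data(), n_embd));

    return 0;
}
```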
convert.py (126 changes)
@@ -332,6 +332,9 @@ class Params:
 #
 
 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ class BpeVocab:
 
 
 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ class SentencePieceVocab:
 
 
 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ class HfVocab:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
 
 
 #
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
@@ -977,6 +996,7 @@ class OutputFile:
         name = str(params.path_model.parent).split('/')[-1]
 
         self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
@@ -1013,21 +1033,9 @@ class OutputFile:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ class OutputFile:
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,6 +1079,26 @@ class OutputFile:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()
 
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
 
@@ -1082,7 +1107,7 @@ class OutputFile:
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1120,8 +1145,11 @@ class OutputFile:
 
         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
 
         # tensor info
         for name, lazy_tensor in model.items():
@@ -1131,24 +1159,7 @@ class OutputFile:
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)
 
         of.close()
 
@@ -1309,8 +1320,8 @@ class VocabFactory:
             return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1319,30 +1330,34 @@ class VocabFactory:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
            )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
 
     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"
 
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return
 
-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
 
     print(f"Vocab info: {vocab}")
@@ -113,13 +113,20 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
             inp.resize(n_batch);
         }
         inputs.push_back(inp);
     }
 
+    // add eos if not present
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
@@ -168,15 +175,26 @@ int main(int argc, char ** argv) {
         float * out = emb + p * n_embd;
         batch_decode(ctx, batch, out, s, n_embd);
 
-    // print first 3 embeddings
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+    // print the first part of the embeddings
+    fprintf(stdout, "\n");
+    for (int j = 0; j < n_prompts; j++) {
+        fprintf(stdout, "embedding %d: ", j);
+        for (int i = 0; i < std::min(16, n_embd); i++) {
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        }
+        fprintf(stdout, "\n");
+    }
+
+    // print cosine similarity matrix
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
         }
-        fprintf(stderr, "\n\n");
+        fprintf(stdout, "\n");
     }
-    fprintf(stderr, "\n");
 
     // clean up
     llama_print_timings(ctx);
@@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
         for (int j = 0; j < ggml_nelements(cur); ++j) {
             if (data[j] != 100 + i) {
                 fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                gguf_free(ctx);
                 return false;
             }
         }
@@ -6,22 +6,6 @@
 
 // #define GRIT_DEBUG
 
-static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
-    float dot = 0.0f;
-    for (uint64_t i = 0; i < v1.size(); ++i) {
-        dot += v1[i] * v2[i];
-    }
-    return dot;
-}
-
-static float norm(const std::vector<float> & v) {
-    return std::sqrt(dot_product(v, v));
-}
-
-static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
-    return dot_product(v1, v2) / (norm(v1) * norm(v2));
-}
-
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     std::vector<std::vector<float>> result;
 
@@ -203,10 +187,12 @@ int main(int argc, char * argv[]) {
     const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
     const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
 
-    const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-    const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-    const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-    const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+    const int n_embd = llama_n_embd(mdl);
+
+    const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+    const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
 
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
@@ -104,6 +104,7 @@ static std::string get_cpu_info() {
                 }
             }
         }
+        fclose(f);
     }
 #endif
     // TODO: other platforms
@@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```
 
-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
 
-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
 python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
@@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     if (!new_clip->ctx_data) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
         clip_free(new_clip);
+        gguf_free(ctx);
         return nullptr;
     }
 
@@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     if (!fin) {
         printf("cannot open model file for loading tensors\n");
         clip_free(new_clip);
+        gguf_free(ctx);
         return nullptr;
     }
 
@@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         if (!fin) {
             printf("%s: failed to seek for tensor %s\n", __func__, name);
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
@@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 break;
             default:
                 printf("Please use an input file in f32 or f16\n");
+                gguf_free(ctx_out);
                 return false;
         }
 
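The clip.cpp hunks above, like the gguf example and the training checkpoint change further down in this commit, apply the same fix: every early-return error path that follows gguf_init_from_file() now releases the GGUF context before returning. A small sketch of that pattern in isolation; the function name and the specific checks are illustrative, only the gguf_* calls come from the ggml API.

```cpp
// Sketch of the leak-fix pattern applied above: free the gguf context on
// every early-return path, not just on the success path.
#include "ggml.h"

#include <cstdio>

static bool inspect_gguf(const char * fname) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,
        /*.ctx      =*/ NULL,
    };

    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", fname);
        return false;
    }

    if (gguf_get_n_tensors(ctx) == 0) {
        // early-return path: without this call the context would leak
        gguf_free(ctx);
        return false;
    }

    printf("%s: %d tensors\n", fname, (int) gguf_get_n_tensors(ctx));

    gguf_free(ctx);
    return true;
}
```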
@@ -119,6 +119,10 @@ def step_server_metrics(context):
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
     while True:
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
             result = sock.connect_ex((context.server_fqdn, context.server_port))
@@ -126,7 +130,7 @@ def step_start_server(context):
                 print("\x1b[33;46mserver started!\x1b[0m")
                 return
             attempts += 1
-            if attempts > 20:
+            if attempts > max_attempts:
                 assert False, "server not started"
             print(f"waiting for server to start, connect error code = {result}...")
             time.sleep(0.1)
@@ -943,6 +947,9 @@ async def wait_for_health_status(context,
    print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
    interval = 0.5
    counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
    async with aiohttp.ClientSession() as session:
        while True:
            async with await session.get(f'{base_url}/health', params=params) as health_response:
@@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model *
 
     load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
 
+    gguf_free(fctx);
     return true;
 }
 
ggml-metal.m (49 changes)
@@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     id<MTLLibrary> metal_library;
 
     // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
     {
         NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #else
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
 
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
+
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (try_metallib && path_lib != nil) {
             // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
 
             metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             extern const char ggml_metallib_start[];
             extern const char ggml_metallib_end[];
 
             NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
 
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
+            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
 
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
             } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
+                path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
             }
-            if (sourcePath == nil) {
+
+            if (path_source == nil) {
                 GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal-merged.metal";
+                path_source = @"ggml-metal.metal";
             }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY
 
         @autoreleasepool {
             // dictionary of preprocessor macros
ggml.h (64 changes)
@@ -344,24 +344,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
         GGML_TYPE_F32  = 0,
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
+        // GGML_TYPE_Q4_3 = 5, support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
         GGML_TYPE_Q2_K = 10,
         GGML_TYPE_Q3_K = 11,
         GGML_TYPE_Q4_K = 12,
         GGML_TYPE_Q5_K = 13,
         GGML_TYPE_Q6_K = 14,
         GGML_TYPE_Q8_K = 15,
+
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
@@ -370,9 +370,9 @@ extern "C" {
         GGML_TYPE_IQ3_S = 21,
         GGML_TYPE_IQ2_S = 22,
         GGML_TYPE_IQ4_XS = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8  = 24,
+        GGML_TYPE_I16 = 25,
+        GGML_TYPE_I32 = 26,
         GGML_TYPE_COUNT,
     };
 
@@ -390,20 +390,20 @@ extern "C" {
 
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
         GGML_FTYPE_ALL_F32 = 0,
         GGML_FTYPE_MOSTLY_F16 = 1,           // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0 = 2,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         GGML_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q2_K = 10,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q3_K = 11,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_K = 12,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_K = 13,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_K = 14,         // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15,      // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS = 16,       // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17,      // except 1d tensors
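The point of pinning I8/I16/I32 to explicit values, and of the new NOTE at the top of the enum, is that these ids are written into GGUF files and must never be renumbered. An illustrative compile-time check of that contract follows; the static_asserts are an example, not part of ggml.

```cpp
// Illustrative only: the ggml/GGUF tensor type ids are part of the file format,
// so appending new entries is safe but renumbering existing ones is not.
// A consumer can make that contract visible with compile-time checks.
#include "ggml.h"

static_assert(GGML_TYPE_IQ4_XS == 23, "quant type ids must stay stable");
static_assert(GGML_TYPE_I8     == 24, "integer type ids must stay stable");
static_assert(GGML_TYPE_I16    == 25, "integer type ids must stay stable");
static_assert(GGML_TYPE_I32    == 26, "integer type ids must stay stable");

int main() {
    return 0;
}
```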
@@ -32,6 +32,7 @@ class Keys:
         FILE_TYPE            = "general.file_type"
 
     class LLM:
+        VOCAB_SIZE        = "{arch}.vocab_size"
         CONTEXT_LENGTH    = "{arch}.context_length"
         EMBEDDING_LENGTH  = "{arch}.embedding_length"
         BLOCK_COUNT       = "{arch}.block_count"
@@ -661,6 +662,9 @@ class GGMLQuantizationType(IntEnum):
     IQ3_S   = 21
     IQ2_S   = 22
     IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
 
 
 class GGUFEndian(IntEnum):
@@ -727,6 +731,9 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
 }
 
 
@@ -746,6 +753,7 @@ KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE      = Keys.General.FILE_TYPE
 
 # LLM
+KEY_VOCAB_SIZE       = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH   = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT      = Keys.LLM.BLOCK_COUNT
@@ -248,6 +248,15 @@ class GGUFReader:
         elif ggml_type == GGMLQuantizationType.F16:
             item_count = n_elems
             item_type = np.float16
+        elif ggml_type == GGMLQuantizationType.I8:
+            item_count = n_elems
+            item_type = np.int8
+        elif ggml_type == GGMLQuantizationType.I16:
+            item_count = n_elems
+            item_type = np.int16
+        elif ggml_type == GGMLQuantizationType.I32:
+            item_count = n_elems
+            item_type = np.int32
         else:
             item_count = n_bytes
             item_type = np.uint8
@@ -196,9 +196,6 @@ class GGUFWriter:
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
-
         encoded_name = name.encode("utf8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
@@ -207,7 +204,18 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            else:
+                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)
@@ -313,6 +321,9 @@ class GGUFWriter:
         self.data_alignment = alignment
         self.add_uint32(Keys.General.ALIGNMENT, alignment)
 
+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.7.0"
+version = "0.8.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -616,7 +616,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.61.2"
+KcppVersion = "1.62"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
llama.cpp (126 changes)
@@ -282,6 +282,7 @@ enum llm_kv {
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,
 
+    LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
@@ -345,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,    "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
+    { LLM_KV_VOCAB_SIZE,             "%s.vocab_size"       },
     { LLM_KV_CONTEXT_LENGTH,         "%s.context_length"   },
     { LLM_KV_EMBEDDING_LENGTH,       "%s.embedding_length" },
     { LLM_KV_BLOCK_COUNT,            "%s.block_count"      },
@@ -3288,10 +3290,11 @@ static const char * llama_model_type_name(e_model type) {
 
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default: return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        default:                    return "unknown";
     }
 }
 
@@ -3323,14 +3326,14 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,           hparams.n_vocab,       false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
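The changed n_vocab line relies on the optional get_key() call returning false, rather than throwing, when the %s.vocab_size key is absent, so the || short-circuits into the old behaviour of counting the tokenizer token list. A generic sketch of that short-circuit fallback idiom with stand-in lookup functions; none of these helpers are llama.cpp API.

```cpp
// Generic sketch of the fallback idiom used above: try an optional key first,
// and only fall back to the legacy lookup when it is missing.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

// stand-in for an optional metadata lookup (returns false instead of throwing)
static bool get_optional_key(const std::map<std::string, uint32_t> & kv, const std::string & key, uint32_t & out) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        return false;
    }
    out = it->second;
    return true;
}

// stand-in for the legacy path (e.g. counting the tokenizer token list)
static bool get_legacy_count(uint32_t & out) {
    out = 32000;
    return true;
}

int main() {
    std::map<std::string, uint32_t> kv; // e.g. {"llama.vocab_size", 32016} when present

    uint32_t n_vocab = 0;
    // same shape as: ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab);
    get_optional_key(kv, "llama.vocab_size", n_vocab) || get_legacy_count(n_vocab);

    printf("n_vocab = %u\n", n_vocab);
    return 0;
}
```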
@@ -3692,30 +3695,25 @@ static void llm_load_vocab(
 
     const auto kv = LLM_KV(model.arch);
 
-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
     // determine vocab type
     {
         std::string tokenizer_name;
 
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
-        if (tokenizer_name == "llama") {
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id    = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -3790,6 +3788,23 @@
         }
     }
 
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
     const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
 
     vocab.id_to_token.resize(n_vocab);
@@ -3997,7 +4012,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: causal attm      = %d\n",     __func__, hparams.causal_attn);
+    LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
@@ -5095,7 +5110,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
     llm_load_print_meta(ml, model);
 
-    if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+    if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
+        model.hparams.n_vocab != model.vocab.id_to_token.size()) {
         throw std::runtime_error("vocab size mismatch");
     }
 
@ -9108,8 +9124,8 @@ static int llama_decode_internal(
|
||||||
//llama_synchronize(&lctx);
|
//llama_synchronize(&lctx);
|
||||||
|
|
||||||
// decide if we need to defrag the kv cache
|
// decide if we need to defrag the kv cache
|
||||||
if (cparams.defrag_thold >= 0.0f) {
|
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
|
||||||
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
|
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
|
||||||
|
|
||||||
// queue defragmentation for next llama_kv_cache_update
|
// queue defragmentation for next llama_kv_cache_update
|
||||||
if (fragmentation > cparams.defrag_thold) {
|
if (fragmentation > cparams.defrag_thold) {
|
||||||
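
Example (commentary, not part of the diff): the updated heuristic above measures fragmentation as the share of tracked KV cells that are currently unused, evaluated only once the cache spans at least 128 cells and, after this change, only for causal-attention contexts. A standalone sketch with illustrative numbers:

#include <cstdint>
#include <cstdio>

// fragmentation = fraction of allocated-but-unused cells,
// computed only for caches with >= 128 cells
static float kv_fragmentation(uint32_t n_cells, uint32_t n_used) {
    return n_cells >= 128 ? 1.0f - float(n_used)/float(n_cells) : 0.0f;
}

int main() {
    // e.g. 1024 tracked cells of which 640 are used -> fragmentation 0.375
    std::printf("%.3f\n", kv_fragmentation(1024, 640));
    return 0;
}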
@@ -9141,6 +9157,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;

+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
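
Example (commentary, not part of the diff): the new max_moves cap follows directly from the comment above; each planned move contributes 6*n_layer nodes to the defrag graph, so the number of moves per graph is bounded by the node limit. A standalone sketch, where the 8192 node limit and 32 layers are illustrative assumptions rather than values taken from this patch:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t max_nodes = 8192;   // stand-in for LLAMA_MAX_NODES (assumed)
    const uint32_t n_layer   = 32;     // e.g. a 7B-class model (assumed)
    // 6 tensors per move: src view, dst view, copy; x2 for K and V
    const uint32_t max_moves = max_nodes/(6*n_layer);
    std::printf("max_moves = %u\n", (unsigned) max_moves); // prints 42
    return 0;
}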
@@ -9167,15 +9188,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }

-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;

@@ -9205,11 +9217,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;

+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];

             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
@@ -9236,6 +9256,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }

+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

         i0 += nh - 1;
@@ -9425,26 +9449,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
@@ -9466,6 +9496,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
@@ -10527,6 +10558,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 }
             }
         } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
     }

     return output;
@@ -12261,7 +12294,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }

-static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
     int counter = 0;
     size_t new_size = 0;
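
Example (commentary, not part of the diff): widening the return type of llama_tensor_quantize_internal to size_t matters because the function reports the quantized data size in bytes, which can exceed what int32_t holds for large tensors. A minimal standalone sketch of the truncation the change avoids; the ~3 GiB figure is an illustrative assumption:

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical quantized tensor of ~3 GiB, larger than INT32_MAX bytes
    const size_t  nbytes = (size_t) 3 * 1024 * 1024 * 1024;
    const int32_t as_i32 = (int32_t) nbytes;  // overflows; typically comes out negative
    std::printf("size_t: %zu bytes, int32_t: %d\n", nbytes, as_i32);
    return 0;
}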
@@ -13437,7 +13470,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 }

 int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
+    return model->hparams.n_vocab;
 }

 int32_t llama_n_ctx_train(const struct llama_model * model) {
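
Example (commentary, not part of the diff): with llama_n_vocab() now reading hparams.n_vocab rather than the token-table size, it keeps reporting the width of the model's output layer even when the vocabulary table is empty (the "no_vocab" case above). A small usage sketch of the typical consumer that relies on this:

#include "llama.h"
#include <vector>

// logits consumers size their buffers from llama_n_vocab(); after this
// change that stays correct for models loaded without a vocabulary
std::vector<float> alloc_logits_row(const struct llama_model * model) {
    return std::vector<float>(llama_n_vocab(model), 0.0f);
}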
@@ -14271,14 +14304,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
 }

 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].text.c_str();
 }

 float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].score;
 }

 llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].type;
 }

7 llama.h
@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;

     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };

     // note: these values should be synchronized with ggml_rope
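
Example (commentary, not part of the diff): inserting LLAMA_VOCAB_TYPE_NONE at value 0 shifts the numeric values of the existing enumerators, so anything that stored or compared the raw integers needs rebuilding against the new header. A hedged sketch of a caller branching on the updated enum, assuming the public llama_vocab_type() accessor; the helper name is illustrative:

#include "llama.h"

// illustrative helper: map the updated vocab-type enum to a label,
// handling the new NONE case explicitly
static const char * vocab_type_name(const struct llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: return "none (model has no vocab)";
        case LLAMA_VOCAB_TYPE_SPM:  return "SentencePiece";
        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
        case LLAMA_VOCAB_TYPE_WPM:  return "WordPiece";
    }
    return "unknown";
}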