Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-15 19:39:42 +00:00)
remove old filever checks (+7 squashed commits)

Squashed commits:
[b72627a] new format not working
[e568870] old ver works
[7053b77] compile errors fixed, fixing linkers
[4ae8889] add new ver
[ff82dfd] file format checks
[25b8aa8] refactoring type names
[931063b] still merging
Parent: 417302b226
Commit: c048bcfec4
25 changed files with 3180 additions and 506 deletions
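
The diff below is essentially a mechanical rename of the legacy loader's symbols: e_model, MB, the MEM_REQ_* helpers, checked_mul/checked_div, utf8_len, and the SPLIT_* enumerators all gain a `_2` spelling so the old-format code path (the llama_v2_* code in the hunks below) can be compiled and linked next to the new-format loader without redefinitions or duplicate symbols. A minimal illustrative sketch of that coexistence pattern follows; the file name and the scratch-size helpers in it are hypothetical stand-ins, not code from this commit.

    // coexist_sketch.cpp -- illustrative only, not part of this commit.
    // Why the "_2" renames: once the legacy loader and the current loader are
    // built into the same binary, a second definition of "e_model" (or a second
    // MEM_REQ_SCRATCH0, checked_mul, utf8_len, ...) would collide, either as a
    // redefinition in a translation unit that sees both declarations, or as a
    // duplicate symbol at link time. Distinct names for the legacy copies avoid both.
    #include <cstddef>
    #include <cstdio>

    // Stand-in for the current code path.
    enum e_model  { MODEL_UNKNOWN,   MODEL_7B   };
    static size_t scratch_bytes(e_model type)    { return type == MODEL_7B   ? 512u * 1024 * 1024 : 0; }

    // Stand-in for the legacy (old file format) code path after the rename.
    enum e_model2 { MODEL_UNKNOWN_2, MODEL_7B_2 };
    static size_t scratch_bytes_2(e_model2 type) { return type == MODEL_7B_2 ? 512u * 1024 * 1024 : 0; }

    int main() {
        // A file-version check (not shown) would pick which loader to call;
        // both can be addressed from the same program because nothing clashes.
        std::printf("current loader scratch: %zu bytes\n", scratch_bytes(MODEL_7B));
        std::printf("legacy  loader scratch: %zu bytes\n", scratch_bytes_2(MODEL_7B_2));
        return 0;
    }

Nearly every hunk below follows that pattern: the identifier on a "-" line reappears on the following "+" line with a _2 suffix and nothing else changes. The final hunk additionally adds a llama_v2_tokenize overload that returns a std::vector<llama_token>.
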
@@ -5,7 +5,7 @@
 #include <cstdio>
 #endif
 
-#include "llama-util.h"
+#include "llama_v2-util.h"
 #include "llama_v2.h"
 
 #include "ggml_v2.h"
@@ -39,67 +39,67 @@
 #define LLAMA_V2_MAX_SCRATCH_BUFFERS 16
 
 // available llama models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_7B,
-    MODEL_13B,
-    MODEL_30B,
-    MODEL_65B,
+enum e_model2 {
+    MODEL_UNKNOWN_2,
+    MODEL_7B_2,
+    MODEL_13B_2,
+    MODEL_30B_2,
+    MODEL_65B_2,
 };
 
-static const size_t MB = 1024*1024;
+static const size_t MB_2 = 1024*1024;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
 // needs modifications in ggml
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH0_2()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_UNKNOWN, 512ull * MB },
-        { MODEL_7B, 512ull * MB },
-        { MODEL_13B, 512ull * MB },
-        { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+    static std::map<e_model2, size_t> k_sizes = {
+        { MODEL_UNKNOWN_2, 512ull * MB_2 },
+        { MODEL_7B_2, 512ull * MB_2 },
+        { MODEL_13B_2, 512ull * MB_2 },
+        { MODEL_30B_2, 512ull * MB_2 },
+        { MODEL_65B_2, 1024ull * MB_2 },
     };
     return k_sizes;
 }
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH1_2()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_UNKNOWN, 512ull * MB },
-        { MODEL_7B, 512ull * MB },
-        { MODEL_13B, 512ull * MB },
-        { MODEL_30B, 512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+    static std::map<e_model2, size_t> k_sizes = {
+        { MODEL_UNKNOWN_2, 512ull * MB_2 },
+        { MODEL_7B_2, 512ull * MB_2 },
+        { MODEL_13B_2, 512ull * MB_2 },
+        { MODEL_30B_2, 512ull * MB_2 },
+        { MODEL_65B_2, 1024ull * MB_2 },
     };
     return k_sizes;
 }
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+static const std::map<e_model2, size_t> & MEM_REQ_KV_SELF_2()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_UNKNOWN, 1026ull * MB },
-        { MODEL_7B, 1026ull * MB },
-        { MODEL_13B, 1608ull * MB },
-        { MODEL_30B, 3124ull * MB },
-        { MODEL_65B, 5120ull * MB },
+    static std::map<e_model2, size_t> k_sizes = {
+        { MODEL_UNKNOWN_2, 1026ull * MB_2 },
+        { MODEL_7B_2, 1026ull * MB_2 },
+        { MODEL_13B_2, 1608ull * MB_2 },
+        { MODEL_30B_2, 3124ull * MB_2 },
+        { MODEL_65B_2, 5120ull * MB_2 },
     };
     return k_sizes;
 }
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model2, size_t> & MEM_REQ_EVAL_2()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_UNKNOWN, 800ull * MB },
-        { MODEL_7B, 800ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+    static std::map<e_model2, size_t> k_sizes = {
+        { MODEL_UNKNOWN_2, 800ull * MB_2 },
+        { MODEL_7B_2, 800ull * MB_2 },
+        { MODEL_13B_2, 1024ull * MB_2 },
+        { MODEL_30B_2, 1280ull * MB_2 },
+        { MODEL_65B_2, 1536ull * MB_2 },
     };
     return k_sizes;
 }
@@ -157,7 +157,7 @@ struct llama_v2_kv_cache {
 };
 
 struct llama_v2_model {
-    e_model type = MODEL_UNKNOWN;
+    e_model2 type = MODEL_UNKNOWN_2;
 
     llama_v2_hparams hparams;
 
@@ -276,7 +276,7 @@ struct llama_v2_context {
 };
 
 template <typename T>
-static T checked_mul(T a, T b) {
+static T checked_mul2(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
         throw format("overflow multiplying %llu * %llu",
@@ -285,7 +285,7 @@ static T checked_mul(T a, T b) {
     return ret;
 }
 
-static size_t checked_div(size_t a, size_t b) {
+static size_t checked_div2(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
         throw format("error dividing %zu / %zu", a, b);
     }
@@ -304,7 +304,7 @@ static std::string llama_v2_format_tensor_shape(const std::vector<uint32_t> & ne
 static size_t llama_v2_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_v2_type type) {
     size_t size = ggml_v2_type_size(type);
     for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
+        size = checked_mul2<size_t>(size, dim);
     }
     return size / ggml_v2_blck_size(type);
 }
@@ -322,9 +322,9 @@ struct llama_v2_load_tensor_shard {
 };
 
 enum llama_v2_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
+    SPLIT_NONE_2,
+    SPLIT_BY_COLUMNS_2,
+    SPLIT_BY_ROWS_2
 };
 
 struct llama_v2_load_tensor {
@@ -332,7 +332,7 @@ struct llama_v2_load_tensor {
 
     std::string name;
     enum ggml_v2_type type = GGML_V2_TYPE_F32;
-    llama_v2_split_type split_type = SPLIT_NONE;
+    llama_v2_split_type split_type = SPLIT_NONE_2;
     std::vector<uint32_t> ne;
     size_t size;
     struct ggml_v2_tensor * ggml_v2_tensor = NULL;
@@ -360,13 +360,13 @@ struct llama_v2_load_tensor {
     void calc_split_type() {
         if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
             shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
+            split_type = SPLIT_NONE_2;
         } else if (name.find("tok_embeddings.") == 0 ||
             name.find(".attention.wo.weight") != std::string::npos ||
             name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
+            split_type = SPLIT_BY_COLUMNS_2;
         } else {
-            split_type = SPLIT_BY_ROWS;
+            split_type = SPLIT_BY_ROWS_2;
         }
     }
 
@@ -382,16 +382,16 @@ struct llama_v2_load_tensor {
         LLAMA_V2_ASSERT(shards.size() <= UINT32_MAX);
         uint32_t n_shards = (uint32_t) shards.size();
         switch (split_type) {
-            case SPLIT_NONE:
+            case SPLIT_NONE_2:
                 ne = first_shard.ne;
                 break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
+            case SPLIT_BY_COLUMNS_2:
+                ne = {checked_mul2<uint32_t>(first_shard.ne[0], n_shards),
                       first_shard.ne[1]};
                 break;
-            case SPLIT_BY_ROWS:
+            case SPLIT_BY_ROWS_2:
                 ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
+                      checked_mul2<uint32_t>(first_shard.ne[1], n_shards)};
                 break;
         }
     }
@@ -737,11 +737,11 @@ struct llama_v2_model_loader {
         if (use_mmap) {
             LLAMA_V2_ASSERT(lt.shards.size() == 1);
             lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
+        } else if (lt.split_type == SPLIT_NONE_2) {
            llama_v2_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
            file.seek(lt.shards.at(0).file_off, SEEK_SET);
            file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
+        } else if (lt.split_type == SPLIT_BY_ROWS_2) {
            size_t offset = 0;
            for (llama_v2_load_tensor_shard & shard : lt.shards) {
                llama_v2_file & file = file_loaders.at(shard.file_idx)->file;
@@ -750,7 +750,7 @@ struct llama_v2_model_loader {
                 offset += shard.size;
             }
             LLAMA_V2_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
+        } else if (lt.split_type == SPLIT_BY_COLUMNS_2) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
             std::vector<llama_v2_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
@@ -807,7 +807,7 @@ static bool kv_cache_init(
     const int64_t n_mem = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
-    cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB_2);
 
     struct ggml_v2_init_params params;
     params.mem_size = cache.buf.size;
@@ -888,12 +888,12 @@ static const char *llama_v2_ftype_name(enum llama_v2_ftype ftype) {
     }
 }
 
-static const char *llama_v2_model_type_name(e_model type) {
+static const char *llama_v2_model_type_name(e_model2 type) {
     switch (type) {
-        case MODEL_7B: return "7B";
-        case MODEL_13B: return "13B";
-        case MODEL_30B: return "30B";
-        case MODEL_65B: return "65B";
+        case MODEL_7B_2: return "7B";
+        case MODEL_13B_2: return "13B";
+        case MODEL_30B_2: return "30B";
+        case MODEL_65B_2: return "65B";
         default:
             printf("\nWARNING: NON-STANDARD LLAMA FILE DETECTED. DEFAULT TO 7B SIZE.\n");
             return "UNKNOWN";
@@ -925,11 +925,11 @@ static void llama_v2_model_load_internal(
 
     {
         switch (hparams.n_layer) {
-            case 32: model.type = e_model::MODEL_7B; break;
-            case 40: model.type = e_model::MODEL_13B; break;
-            case 60: model.type = e_model::MODEL_30B; break;
-            case 80: model.type = e_model::MODEL_65B; break;
-            default: model.type = e_model::MODEL_UNKNOWN; break;
+            case 32: model.type = e_model2::MODEL_7B_2; break;
+            case 40: model.type = e_model2::MODEL_13B_2; break;
+            case 60: model.type = e_model2::MODEL_30B_2; break;
+            case 80: model.type = e_model2::MODEL_65B_2; break;
+            default: model.type = e_model2::MODEL_UNKNOWN_2; break;
         }
 
         hparams.n_ctx = n_ctx;
@@ -985,13 +985,13 @@ static void llama_v2_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_SCRATCH0_2().at(model.type) +
+            MEM_REQ_SCRATCH1_2().at(model.type) +
+            MEM_REQ_EVAL_2().at(model.type);
 
         // this is the memory required by one llama_v2_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            scale*MEM_REQ_KV_SELF_2().at(model.type);
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1462,7 +1462,7 @@ static bool llama_v2_eval_internal(
 // tokenizer
 //
 
-static size_t utf8_len(char src) {
+static size_t utf8_len2(char src) {
     const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
     uint8_t highbits = static_cast<uint8_t>(src) >> 4;
     return lookup[highbits];
@@ -1503,7 +1503,7 @@ struct llama_v2_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_v2_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            size_t char_len = std::min(text.size() - offs, utf8_len2(text[offs]));
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
@@ -2251,10 +2251,10 @@ struct llama_v2_context * llama_v2_init_from_file(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL_2().at(ctx->model.type));
 
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0_2().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1_2().at(ctx->model.type));
     }
 
     return ctx;
@@ -3077,5 +3077,15 @@ std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * c
     int n = legacy_llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
     res.resize(n);
 
     return res;
 }
+
+std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos) {
+    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int) add_bos);
+    const int n = llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    assert(n >= 0);
+    res.resize(n);
+
+    return res;
+}