update license, added backwards compatibility with both ggml model formats, fixed context length issues.

2026-05-10 04:00:53 +00:00 · 2023-03-20 23:43:35 +08:00 · 2023-03-20 23:43:35 +08:00 · 8d39365af6
commit 8d39365af6
parent a2c10e0d2f
11 changed files with 807 additions and 15 deletions
--- a/extra.cpp
+++ b/extra.cpp
@ -0,0 +1,72 @@
+
+#include "extra.h"
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <regex>
+#include <iostream>
+#include <iterator>
+#include <queue>
+#include <string>
+#include <math.h>
+
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__)
+ #include <alloca.h>
+ #endif
+
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
+std::vector<gpt_vocab::id> legacy_llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
+    std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+	    // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+	    break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
+
+    if (bos) {
+        res.push_back(1); // TODO: replace with vocab.bos
+    }
+
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
+
+    return res;
+}