Merge commit 'ad3a0505e3' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/close-issue.yml
#	.github/workflows/code-coverage.yml
#	.github/workflows/docker.yml
#	.github/workflows/editorconfig.yml
#	.github/workflows/nix-ci-aarch64.yml
#	.github/workflows/nix-ci.yml
#	.github/workflows/python-check-requirements.yml
#	.github/workflows/python-lint.yml
#	.github/workflows/server.yml
#	.github/workflows/zig-build.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README-sycl.md
#	README.md
#	build.zig
#	common/CMakeLists.txt
#	llama.cpp
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
Concedo 2024-04-06 18:32:57 +08:00
commit 9c0fbf9f73
67 changed files with 10861 additions and 4661 deletions


@@ -40,6 +40,9 @@
#endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <thread>
#include <future>
#endif
#if defined(_MSC_VER)
@@ -62,7 +65,7 @@
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
@@ -102,7 +105,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
void process_escapes(std::string& input) {
void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -155,8 +158,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result;
}
static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) {
std::string arg = argv[i];
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
llama_sampling_params& sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
@@ -649,14 +651,6 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
params.model = argv[i];
return true;
}
if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.model_url = argv[i];
return true;
}
if (arg == "-md" || arg == "--model-draft") {
if (++i >= argc) {
invalid_param = true;
@@ -673,6 +667,30 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
params.model_alias = argv[i];
return true;
}
if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.model_url = argv[i];
return true;
}
if (arg == "-hfr" || arg == "--hf-repo") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.hf_repo = argv[i];
return true;
}
if (arg == "-hff" || arg == "--hf-file") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.hf_file = argv[i];
return true;
}
if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
@@ -949,6 +967,22 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
}
return true;
}
if (arg == "-lcs" || arg == "--lookup-cache-static") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_static = argv[i];
return true;
}
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_dynamic = argv[i];
return true;
}
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
@@ -1202,13 +1236,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) {
if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
@@ -1216,6 +1252,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
// short-hand to avoid specifying --hf-file -> default it to --model
if (!params.hf_repo.empty() && params.hf_file.empty()) {
params.hf_file = params.model;
}
if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
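
For context, a minimal standalone sketch (hypothetical repo and file names, taken from the examples quoted later in this diff) of what the --hf-repo short-hand above amounts to: when only -hfr is given, the -m path doubles as the file name inside the repository, and the download URL is built from the two.

    #include <cstdio>
    #include <string>

    int main() {
        std::string model   = "tinyllama-1.1b/ggml-model-f16.gguf"; // -m
        std::string hf_repo = "ggml-org/models";                    // -hfr
        std::string hf_file;                                        // -hff left unset

        if (!hf_repo.empty() && hf_file.empty()) {
            hf_file = model; // the short-hand from gpt_params_parse_ex
        }

        // same construction as llama_load_model_from_hf further down in this diff
        std::string model_url = "https://huggingface.co/" + hf_repo + "/resolve/main/" + hf_file;
        // prints: https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
        std::printf("%s\n", model_url.c_str());
    }
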
@@ -1405,12 +1446,20 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding\n");
printf(" draft model for speculative decoding (default: unused)\n");
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: unused)\n");
printf(" -hfr REPO, --hf-repo REPO\n");
printf(" Hugging Face model repository (default: unused)\n");
printf(" -hff FILE, --hf-file FILE\n");
printf(" Hugging Face model file (default: unused)\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
@@ -1591,6 +1640,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
@@ -1654,25 +1706,13 @@ void llama_batch_add(
#ifdef LLAMA_USE_CURL
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
fprintf(stderr, "%s: invalid model_url\n", __func__);
return NULL;
}
// Initialize libcurl globally
auto curl = curl_easy_init();
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
static bool llama_download_file(CURL * curl, const char * url, const char * path) {
bool force_download = false;
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl, CURLOPT_URL, model_url);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@@ -1681,16 +1721,16 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// Check if the file already exists locally
struct stat model_file_info;
auto file_exists = (stat(path_model, &model_file_info) == 0);
auto file_exists = (stat(path, &model_file_info) == 0);
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
char etag_path[PATH_MAX] = {0};
snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
char last_modified_path[PATH_MAX] = {0};
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
if (file_exists) {
auto * f_etag = fopen(etag_path, "r");
@@ -1698,7 +1738,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(etag, sizeof(etag), f_etag)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
} else {
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
}
fclose(f_etag);
}
@@ -1708,7 +1748,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
} else {
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
last_modified);
}
fclose(f_last_modified);
@@ -1726,6 +1766,11 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
// Convert header field name to lowercase
for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
buffer[i] = tolower(buffer[i]);
}
const char * etag_prefix = "etag: ";
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@@ -1748,7 +1793,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (res != CURLE_OK) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return NULL;
return false;
}
long http_code = 0;
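
Aside (not part of the diff): the header callback above lower-cases the field name before the ':' so that both "ETag:" from HTTP/1.1 servers and "etag:" from HTTP/2 servers (which send lowercase field names) match the same prefix. A small self-contained sketch of that normalization:

    #include <cctype>
    #include <cstdio>
    #include <cstring>
    #include <string>

    // returns the value of a header line if its (case-insensitive) name matches
    // the given lowercase prefix such as "etag: ", otherwise an empty string
    static std::string header_value(std::string line, const char * prefix) {
        for (size_t i = 0; i < line.size() && line[i] != ':'; ++i) {
            line[i] = (char) tolower((unsigned char) line[i]);
        }
        if (line.compare(0, strlen(prefix), prefix) != 0) {
            return "";
        }
        std::string value = line.substr(strlen(prefix));
        while (!value.empty() && (value.back() == '\r' || value.back() == '\n')) {
            value.pop_back(); // strip the trailing CRLF
        }
        return value;
    }

    int main() {
        std::printf("%s\n", header_value("ETag: \"abc123\"\r\n", "etag: ").c_str()); // "abc123"
    }
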
@@ -1756,30 +1801,34 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
file_exists = false;
force_download = true;
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
// If the ETag or the Last-Modified headers are different: trigger a new download
if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
bool should_download = !file_exists
|| force_download
|| (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
|| (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
if (should_download) {
char path_temporary[PATH_MAX] = {0};
snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
if (file_exists) {
fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
if (remove(path_model) != 0) {
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
if (remove(path) != 0) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
return NULL;
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
return false;
}
}
// Set the output file
auto * outfile = fopen(path_model_temporary, "wb");
auto * outfile = fopen(path_temporary, "wb");
if (!outfile) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
return NULL;
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
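
For readers unfamiliar with the libcurl write-callback pattern the typedef above feeds into, here is a self-contained sketch (illustrative only, not taken from this diff) of streaming a response body straight into a local file. Usage would be e.g. fetch_to_path("https://example.com/model.gguf", "model.gguf").

    #include <cstdio>
    #include <curl/curl.h>

    // libcurl hands each chunk of the response body to this callback,
    // which writes it to the FILE * passed via CURLOPT_WRITEDATA
    static size_t write_to_file(void * data, size_t size, size_t nmemb, void * fd) {
        return fwrite(data, size, nmemb, (FILE *) fd);
    }

    static bool fetch_to_path(const char * url, const char * path) {
        CURL * curl = curl_easy_init();
        if (!curl) {
            return false;
        }
        FILE * out = fopen(path, "wb");
        if (!out) {
            curl_easy_cleanup(curl);
            return false;
        }
        curl_easy_setopt(curl, CURLOPT_URL, url);
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_to_file);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, out);
        bool ok = curl_easy_perform(curl) == CURLE_OK;
        fclose(out);
        curl_easy_cleanup(curl);
        return ok;
    }
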
@@ -1793,15 +1842,30 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
}
std::size_t at_pos = url.find('@', protocol_pos + 3);
if (at_pos == std::string::npos) {
return url; // No password in URL
}
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
};
// start the download
fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
model_url, path_model, headers.etag, headers.last_modified);
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
auto res = curl_easy_perform(curl);
if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return NULL;
return false;
}
long http_code = 0;
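
For reference, a standalone copy of the password-masking helper above, with a hypothetical URL showing what it produces:

    #include <cstdio>
    #include <string>

    // same logic as llama_download_hide_password_in_url above
    static std::string hide_password_in_url(const std::string & url) {
        std::size_t protocol_pos = url.find("://");
        if (protocol_pos == std::string::npos) {
            return url; // malformed URL
        }
        std::size_t at_pos = url.find('@', protocol_pos + 3);
        if (at_pos == std::string::npos) {
            return url; // no credentials in the URL
        }
        return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
    }

    int main() {
        // prints: https://********@example.com/models/model.gguf
        std::printf("%s\n", hide_password_in_url("https://user:secret@example.com/models/model.gguf").c_str());
    }
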
@@ -1810,7 +1874,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
return NULL;
return false;
}
// Clean up
@@ -1822,7 +1886,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (etag_file) {
fputs(headers.etag, etag_file);
fclose(etag_file);
fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
}
}
@@ -1832,42 +1896,177 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
if (rename(path_model_temporary, path_model) != 0) {
if (rename(path_temporary, path) != 0) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
return false;
}
}
return true;
}
struct llama_model * llama_load_model_from_url(
const char * model_url,
const char * path_model,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
fprintf(stderr, "%s: invalid model_url\n", __func__);
return NULL;
}
// Initialize libcurl
auto * curl = curl_easy_init();
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
if (!llama_download_file(curl, model_url, path_model)) {
return NULL;
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
return NULL;
}
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT_STR);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
gguf_free(ctx_gguf);
}
curl_easy_cleanup(curl);
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
fprintf(stderr, "\n%s: unexpected model file name: %s"
" n_split=%d\n", __func__, path_model, n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
fprintf(stderr, "\n%s: unexpected model url: %s"
" n_split=%d\n", __func__, model_url, n_split);
return NULL;
}
}
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
auto * curl = curl_easy_init();
bool res = llama_download_file(curl, split_url, split_path);
curl_easy_cleanup(curl);
return res;
}, idx));
}
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return NULL;
}
}
}
return llama_load_model_from_file(path_model, params);
}
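
The split handling above fans the remaining downloads out with std::async and fails if any future reports failure; a minimal self-contained sketch of that pattern follows (download_one, download_all, and the example URLs are placeholders, not names from llama.cpp):

    #include <cstdio>
    #include <future>
    #include <string>
    #include <utility>
    #include <vector>

    // placeholder "download" so the sketch compiles and runs on its own
    static bool download_one(const std::string & url, const std::string & path) {
        std::printf("downloading %s -> %s\n", url.c_str(), path.c_str());
        return true;
    }

    // launch one async task per (url, path) pair and wait for all of them
    static bool download_all(const std::vector<std::pair<std::string, std::string>> & jobs) {
        std::vector<std::future<bool>> futures;
        futures.reserve(jobs.size());
        for (const auto & job : jobs) {
            futures.push_back(std::async(std::launch::async, [job]() {
                return download_one(job.first, job.second);
            }));
        }
        bool ok = true;
        for (auto & f : futures) {
            ok = f.get() && ok; // wait for every task, remember any failure
        }
        return ok;
    }

    int main() {
        download_all({{"https://example.com/m-00002-of-00003.gguf", "m-00002-of-00003.gguf"},
                      {"https://example.com/m-00003-of-00003.gguf", "m-00003-of-00003.gguf"}});
    }
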
struct llama_model * llama_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const struct llama_model_params & params) {
// construct hugging face model url:
//
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
//
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
//
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += model;
return llama_load_model_from_url(model_url.c_str(), path_model, params);
}
#else
struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
struct llama_model_params /*params*/) {
struct llama_model * llama_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
struct llama_model * llama_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}
#endif // LLAMA_USE_CURL
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
if (!params.model_url.empty()) {
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
@@ -1907,7 +2106,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
int err = llama_model_apply_lora_from_file(model,
lora_adapter.c_str(),