diff --git a/common/arg.cpp b/common/arg.cpp index 658b5cdea..8d2786be0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -131,17 +131,26 @@ std::string common_arg::to_string() { static void common_params_handle_model_default( std::string & model, - std::string & model_url, + const std::string & model_url, std::string & hf_repo, - std::string & hf_file) { + std::string & hf_file, + const std::string & hf_token) { if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (hf_file.empty()) { if (model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + auto auto_detected = common_get_hf_file(hf_repo, hf_token); + if (auto_detected.first.empty() || auto_detected.second.empty()) { + exit(1); // built without CURL, error message already printed + } + hf_repo = auto_detected.first; + hf_file = auto_detected.second; + } else { + hf_file = model; } - hf_file = model; - } else if (model.empty()) { + } + // make sure model path is present (for caching purposes) + if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename @@ -291,8 +300,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } // TODO: refactor model params in a common struct - common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); - common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); + common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token); if (params.escape) { string_process_escapes(params.prompt); @@ -769,15 +778,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, - string_format( - "run in conversation mode:\n" - "- does not print special tokens and suffix/prefix\n" - "- interactive mode is also enabled\n" - "(default: %s)", - params.conversation ? "true" : "false" - ), + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: auto enabled if chat template is available)", [](common_params & params) { - params.conversation = true; + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-no-cnv", "--no-conversation"}, + "force disable conversation mode (default: false)", + [](common_params & params) { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( @@ -1584,21 +1597,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( - {"-hfr", "--hf-repo"}, "REPO", - "Hugging Face model repository (default: unused)", + {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", + "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "example: unsloth/phi-4-GGUF:q4_k_m\n" + "(default: unused)", [](common_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file (default: unused)", + "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( - {"-hfrv", "--hf-repo-v"}, "REPO", + {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { params.vocoder.hf_repo = value; diff --git a/common/common.cpp b/common/common.cpp index 37389e357..46c21b53b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -75,6 +75,22 @@ #include #endif #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 + +// +// CURL utils +// + +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; #endif // LLAMA_USE_CURL using json = nlohmann::ordered_json; @@ -1132,7 +1148,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl - std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; if (!curl) { LOG_ERR("%s: error initializing libcurl\n", __func__); return false; @@ -1146,11 +1163,9 @@ static bool common_download_file(const std::string & url, const std::string & pa // Check if hf-token or bearer-token was specified if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer "; - auth_header += hf_token.c_str(); - struct curl_slist *http_headers = NULL; - http_headers = curl_slist_append(http_headers, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers); + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); } #if defined(_WIN32) @@ -1446,6 +1461,80 @@ struct llama_model * common_load_model_from_hf( return common_load_model_from_url(model_url, local_path, hf_token, params); } +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + // fetch model info from Hugging Face Hub API + json model_info; + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + std::string res_str; + std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + static_cast(data)->append((char * ) ptr, size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (!hf_token.empty()) { + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + throw std::runtime_error("error: cannot make GET request to HF API"); + } + + long res_code; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + if (res_code == 200) { + model_info = json::parse(res_str); + } else if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); + } + + // check response + if (!model_info.contains("ggufFile")) { + throw std::runtime_error("error: model does not have ggufFile"); + } + json & gguf_file = model_info.at("ggufFile"); + if (!gguf_file.contains("rfilename")) { + throw std::runtime_error("error: ggufFile does not have rfilename"); + } + + return std::make_pair(hf_repo, gguf_file.at("rfilename")); +} + #else struct llama_model * common_load_model_from_url( @@ -1467,6 +1556,11 @@ struct llama_model * common_load_model_from_hf( return nullptr; } +std::pair common_get_hf_file(const std::string &, const std::string &) { + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + return std::make_pair("", ""); +} + #endif // LLAMA_USE_CURL // @@ -1638,15 +1732,8 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto // std::string common_get_builtin_chat_template(const struct llama_model * model) { - static const char * template_key = "tokenizer.chat_template"; - // call with NULL buffer to get the total size of the string - int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0); - if (res > 0) { - std::vector model_template(res + 1, 0); - llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size()); - return std::string(model_template.data(), model_template.size() - 1); - } - return ""; + const char * ptr_tmpl = llama_model_chat_template(model); + return ptr_tmpl == nullptr ? "" : ptr_tmpl; } bool common_chat_verify_template(const std::string & tmpl) { diff --git a/common/common.h b/common/common.h index 5be11a3d4..1a976d36d 100644 --- a/common/common.h +++ b/common/common.h @@ -99,6 +99,12 @@ enum dimre_method { DIMRE_METHOD_MEAN, }; +enum common_conversation_mode { + COMMON_CONVERSATION_MODE_DISABLED = 0, + COMMON_CONVERSATION_MODE_ENABLED = 1, + COMMON_CONVERSATION_MODE_AUTO = 2, +}; + // sampling parameters struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler @@ -273,7 +279,6 @@ struct common_params { bool special = false; // enable special token output bool interactive = false; // interactive mode bool interactive_first = false; // wait for user input immediately - bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -299,6 +304,8 @@ struct common_params { ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V + common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO; + // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector // NOLINT std::vector image; // path to image file(s) @@ -452,6 +459,11 @@ static bool string_starts_with(const std::string & str, return str.rfind(prefix, 0) == 0; } +static bool string_ends_with(const std::string & str, + const std::string & suffix) { // While we wait for C++20's std::string::ends_with... + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); @@ -499,6 +511,9 @@ struct llama_model * common_load_model_from_hf( const std::string & local_path, const std::string & hf_token, const struct llama_model_params & params); +std::pair common_get_hf_file( + const std::string & hf_repo_with_tag, + const std::string & hf_token); // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index d5a92d605..05a932227 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -41,7 +41,7 @@ echo PASS echo # 2b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -51,7 +51,7 @@ echo PASS echo # 3b. Test the merged model is loading properly -$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 echo PASS echo @@ -61,7 +61,7 @@ echo PASS echo # 4b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 echo PASS echo @@ -71,7 +71,7 @@ echo #echo # 5b. Test the merged model is loading properly -#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 +#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #echo PASS #echo @@ -81,7 +81,7 @@ echo PASS echo # 6b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 echo PASS echo diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 476553af8..57ecb57c8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,6 +31,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"; + static llama_context ** g_ctx; static llama_model ** g_model; static common_sampler ** g_smpl; @@ -205,8 +207,24 @@ int main(int argc, char ** argv) { LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } + // auto enable conversation mode if chat template is available + const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty(); + if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { + if (has_chat_template) { + LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } else { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + } + + // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning + if (params.conversation_mode && !has_chat_template) { + LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__); + } + // print chat template example in conversation mode - if (params.conversation) { + if (params.conversation_mode) { if (params.enable_chat_template) { LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str()); } else { @@ -253,8 +271,10 @@ int main(int argc, char ** argv) { std::vector embd_inp; { - auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) - ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode + auto prompt = (params.conversation_mode && params.enable_chat_template) + // format the system prompt in conversation mode (fallback to default if empty) + ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt) + // otherwise use the prompt as is : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { LOG_DBG("tokenize the prompt\n"); @@ -328,7 +348,7 @@ int main(int argc, char ** argv) { params.n_keep += add_bos; // always keep the BOS token } - if (params.conversation) { + if (params.conversation_mode) { params.interactive_first = true; } @@ -452,7 +472,11 @@ int main(int argc, char ** argv) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) LOG_INF( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_INF( "%s\n", control_message); + LOG_INF( "%s", control_message); + if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) { + LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n"); + } + LOG_INF("\n"); is_interacting = params.interactive_first; } @@ -764,7 +788,7 @@ int main(int argc, char ** argv) { } // if current token is not EOG, we add it to current assistant message - if (params.conversation) { + if (params.conversation_mode) { const auto id = common_sampler_last(smpl); assistant_ss << common_token_to_piece(ctx, id, false); } @@ -772,7 +796,7 @@ int main(int argc, char ** argv) { if (n_past > 0 && is_interacting) { LOG_DBG("waiting for user input\n"); - if (params.conversation) { + if (params.conversation_mode) { LOG("\n> "); } @@ -782,7 +806,7 @@ int main(int argc, char ** argv) { } std::string buffer; - if (!params.input_prefix.empty() && !params.conversation) { + if (!params.input_prefix.empty() && !params.conversation_mode) { LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); LOG("%s", params.input_prefix.c_str()); } @@ -806,7 +830,7 @@ int main(int argc, char ** argv) { // Entering a empty line lets the user pass control back if (buffer.length() > 1) { // append input suffix if any - if (!params.input_suffix.empty() && !params.conversation) { + if (!params.input_suffix.empty() && !params.conversation_mode) { LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); LOG("%s", params.input_suffix.c_str()); } @@ -819,7 +843,7 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } - bool format_chat = params.conversation && params.enable_chat_template; + bool format_chat = params.conversation_mode && params.enable_chat_template; std::string user_inp = format_chat ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh index 24bc970e8..70f7610f9 100644 --- a/examples/quantize/tests.sh +++ b/examples/quantize/tests.sh @@ -47,7 +47,7 @@ echo PASS echo # 3a. Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -57,7 +57,7 @@ echo PASS echo # 4b. Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 +$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 echo PASS echo diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index 3640a7a6c..26f3583bd 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html index 86a79b77f..2180ef4ad 100644 --- a/examples/server/webui/index.html +++ b/examples/server/webui/index.html @@ -37,7 +37,7 @@
+ }" @click="setViewingConv(conv.id)" dir="auto"> {{ conv.messages[0].content }}
@@ -156,6 +156,7 @@ @keydown.enter.shift.exact.prevent="inputMsg += '\n'" :disabled="isGenerating" id="msg-input" + dir="auto" > @@ -248,6 +249,7 @@