diff --git a/common/arg.cpp b/common/arg.cpp index a1cd5830f..922391069 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,15 +1,17 @@ #include "arg.h" +#include "log.h" #include "sampling.h" #include -#include -#include -#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include #include "json-schema-to-grammar.h" @@ -383,20 +385,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, exit(0); } )); - add_opt(llama_arg( - {"-v", "--verbose"}, - "print verbose information", - [](gpt_params & params) { - params.verbosity = 1; - } - )); - add_opt(llama_arg( - {"--verbosity"}, "N", - format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, int value) { - params.verbosity = value; - } - )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), @@ -417,7 +405,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -697,6 +685,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.n_keep = value; } )); + add_opt(llama_arg( + {"--no-context-shift"}, + format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + [](gpt_params & params) { + params.ctx_shift = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -876,7 +871,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.input_prefix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", @@ -884,7 +879,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.input_suffix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", @@ -1317,7 +1312,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, int value) { params.n_parallel = value; } - )); + ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", params.n_sequences), @@ -1824,19 +1819,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.system_prompt = system_prompt; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--log-format"}, "{text, json}", - "log output format: json or text (default: json)", - [](gpt_params & params, const std::string & value) { - if (value == "json") { - params.log_json = true; - } else if (value == "text") { - params.log_json = false; - } else { - throw std::invalid_argument("invalid value"); - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), @@ -1956,40 +1938,57 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); -#ifndef LOG_DISABLE_LOGS - // TODO: make this looks less weird - add_opt(llama_arg( - {"--log-test"}, - "Log test", - [](gpt_params &) { log_param_single_parse("--log-test"); } - )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params &) { log_param_single_parse("--log-disable"); } - )); - add_opt(llama_arg( - {"--log-enable"}, - "Log enable", - [](gpt_params &) { log_param_single_parse("--log-enable"); } - )); - add_opt(llama_arg( - {"--log-new"}, - "Log new", - [](gpt_params &) { log_param_single_parse("--log-new"); } - )); - add_opt(llama_arg( - {"--log-append"}, - "Log append", - [](gpt_params &) { log_param_single_parse("--log-append"); } + [](gpt_params &) { + gpt_log_pause(gpt_log_main()); + } )); add_opt(llama_arg( {"--log-file"}, "FNAME", - "Log file", - [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + "Log to file", + [](gpt_params &, const std::string & value) { + gpt_log_set_file(gpt_log_main(), value.c_str()); + } )); -#endif // LOG_DISABLE_LOGS + add_opt(llama_arg( + {"--log-colors"}, + "Enable colored logging", + [](gpt_params &) { + gpt_log_set_colors(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_COLORS")); + add_opt(llama_arg( + {"-v", "--verbose", "--log-verbose"}, + "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", + [](gpt_params & params) { + params.verbosity = INT_MAX; + gpt_log_set_verbosity_thold(INT_MAX); + } + )); + add_opt(llama_arg( + {"-lv", "--verbosity", "--log-verbosity"}, "N", + "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", + [](gpt_params & params, int value) { + params.verbosity = value; + gpt_log_set_verbosity_thold(value); + } + ).set_env("LLAMA_LOG_VERBOSITY")); + add_opt(llama_arg( + {"--log-prefix"}, + "Enable prefx in log messages", + [](gpt_params &) { + gpt_log_set_prefix(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_PREFIX")); + add_opt(llama_arg( + {"--log-timestamps"}, + "Enable timestamps in log messages", + [](gpt_params &) { + gpt_log_set_timestamps(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_TIMESTAMPS")); return ctx_arg; } - diff --git a/common/common.cpp b/common/common.cpp index 60b81df72..897efb458 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3,7 +3,9 @@ #endif #include "common.h" +#include "log.h" #include "build-info.h" +#include "log.cpp" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" @@ -26,6 +28,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -49,7 +52,6 @@ #if defined(LLAMA_USE_CURL) #include #include -#include #include #endif @@ -227,7 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!SetPriorityClass(GetCurrentProcess(), p)) { - fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); return false; } @@ -252,7 +254,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!setpriority(PRIO_PROCESS, 0, p)) { - fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); + LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); return false; } return true; @@ -285,14 +287,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (n_set && n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. - fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); } } bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { - fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + LOG_ERR("Format of CPU range is invalid! Expected []-[].\n"); return false; } @@ -304,7 +306,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { start_i = std::stoull(range.substr(0, dash_loc)); if (start_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "Start index out of bounds!\n"); + LOG_ERR("Start index out of bounds!\n"); return false; } } @@ -314,7 +316,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { end_i = std::stoull(range.substr(dash_loc + 1)); if (end_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "End index out of bounds!\n"); + LOG_ERR("End index out of bounds!\n"); return false; } } @@ -349,7 +351,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD } else if (c >= 'A' && c <= 'F') { id -= 'A' - 10; } else { - fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i)); return false; } @@ -362,6 +364,22 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } +void gpt_init() { + llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { + gpt_log_add(gpt_log_main(), level, "%s", text); + } + }, NULL); + +#ifdef NDEBUG + const char * build_type = ""; +#else + const char * build_type = " (debug)"; +#endif + + LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); +} + std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; @@ -442,6 +460,94 @@ void string_replace_all(std::string & s, const std::string & search, const std:: s = std::move(builder); } +std::string string_from(bool value) { + return value ? "true" : "false"; +} + +std::string string_from(const std::vector & values) { + std::stringstream buf; + + buf << "[ "; + bool first = true; + for (auto e : values) { + if (first) { + first = false; + } else { + buf << ", "; + } + buf << std::to_string(e); + } + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (const auto & token : tokens) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + + buf << " ]"; + + return buf.str(); +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -482,7 +588,7 @@ void string_process_escapes(std::string & input) { bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } llama_model_kv_override kvo; @@ -505,20 +611,20 @@ bool string_parse_kv_override(const char * data, std::vector 127) { - fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; } strncpy(kvo.val_str, sep, 127); kvo.val_str[127] = '\0'; } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data); return false; } overrides.emplace_back(std::move(kvo)); @@ -730,7 +836,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str()); return iparams; } @@ -738,7 +844,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); return iparams; } @@ -774,7 +880,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { loaded_la.scale = la.scale; loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); if (loaded_la.adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); + LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); return iparams; @@ -786,12 +892,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (params.sparams.ignore_eos && llama_token_eos(model) == -1) { - fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); + LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); params.sparams.ignore_eos = false; } if (params.warmup) { - LOG("warming up the model with an empty run\n"); + LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); std::vector tmp; llama_token bos = llama_token_bos(model); @@ -956,7 +1062,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ int remaining_attempts = max_attempts; while (remaining_attempts > 0) { - fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); CURLcode res = curl_easy_perform(curl); if (res == CURLE_OK) { @@ -964,13 +1070,14 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; - fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); remaining_attempts--; std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); } - fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + return false; } @@ -979,7 +1086,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { - fprintf(stderr, "%s: error initializing libcurl\n", __func__); + LOG_ERR("%s: error initializing libcurl\n", __func__); return false; } @@ -1020,11 +1127,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat if (metadata_in.good()) { try { metadata_in >> metadata; - fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); if (metadata.contains("url") && metadata.at("url").is_string()) { auto previous_url = metadata.at("url").get(); if (previous_url != url) { - fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); return false; } } @@ -1035,12 +1142,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat last_modified = metadata.at("lastModified"); } } catch (const nlohmann::json::exception & e) { - fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); return false; } } } else { - fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); } // Send a HEAD request to retrieve the etag and last-modified headers @@ -1088,26 +1195,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat // HEAD not supported, we don't know if the file has changed // force trigger downloading force_download = true; - fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } bool should_download = !file_exists || force_download; if (!should_download) { if (!etag.empty() && etag != headers.etag) { - fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); should_download = true; } else if (!last_modified.empty() && last_modified != headers.last_modified) { - fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); should_download = true; } } if (should_download) { std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); if (remove(path.c_str()) != 0) { - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); return false; } } @@ -1122,7 +1229,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); + LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; } @@ -1153,7 +1260,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat }; // start the download - fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); if (!was_perform_successful) { @@ -1163,7 +1270,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat long http_code = 0; curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); if (http_code < 200 || http_code >= 400) { - fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); + LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); return false; } @@ -1177,10 +1284,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat {"lastModified", headers.last_modified} }); std::ofstream(metadata_path) << metadata.dump(4); - fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); if (rename(path_temporary.c_str(), path.c_str()) != 0) { - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } } @@ -1195,7 +1302,7 @@ struct llama_model * llama_load_model_from_url( const struct llama_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); + LOG_ERR("%s: invalid model_url\n", __func__); return NULL; } @@ -1212,7 +1319,7 @@ struct llama_model * llama_load_model_from_url( }; auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); if (!ctx_gguf) { - fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model); return NULL; } @@ -1232,14 +1339,12 @@ struct llama_model * llama_load_model_from_url( // and extract split URL and PATH prefixes { if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model file name: %s" - " n_split=%d\n", __func__, path_model, n_split); + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); return NULL; } if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model url: %s" - " n_split=%d\n", __func__, model_url, n_split); + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); return NULL; } } @@ -1299,7 +1404,7 @@ struct llama_model * llama_load_model_from_url( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } @@ -1309,7 +1414,7 @@ struct llama_model * llama_load_model_from_hf( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; } @@ -1637,13 +1742,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr }; struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); return result; } int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { - fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); } for (int i = 0; i < n_tensors; i++) { @@ -1661,23 +1766,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } } if (layer_idx < 0) { - fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } else if (layer_idx == 0) { - fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); if (tensor->type != GGML_TYPE_F32) { - fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } if (ggml_n_dims(tensor) != 1) { - fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -1685,7 +1790,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr if (result.n_embd == -1) { result.n_embd = ggml_nelements(tensor); } else if (ggml_nelements(tensor) != result.n_embd) { - fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -1702,7 +1807,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } if (result.n_embd == -1) { - fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); result.data.clear(); } @@ -1723,7 +1828,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector #include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -244,6 +242,7 @@ struct gpt_params { bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention bool no_perf = false; // disable performance metrics + bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch @@ -339,6 +338,10 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; +// call once at the start of a program if it uses libcommon +// initializes the logging system and prints info about the build +void gpt_init(); + std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); @@ -374,6 +377,11 @@ static std::vector string_split(const std::string & str, char delim) { bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); +std::string string_from(bool value); +std::string string_from(const std::vector & values); +std::string string_from(const struct llama_context * ctx, const std::vector & tokens); +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); + // // Filesystem utils // diff --git a/common/log.cpp b/common/log.cpp new file mode 100644 index 000000000..2825a227e --- /dev/null +++ b/common/log.cpp @@ -0,0 +1,401 @@ +#include "log.h" + +#include +#include +#include +#include +#include +#include +#include + +int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; + +void gpt_log_set_verbosity_thold(int verbosity) { + gpt_log_verbosity_thold = verbosity; +} + +#define LOG_COL_DEFAULT "\033[0m" +#define LOG_COL_BOLD "\033[1m" +#define LOG_COL_RED "\033[31m" +#define LOG_COL_GREEN "\033[32m" +#define LOG_COL_YELLOW "\033[33m" +#define LOG_COL_BLUE "\033[34m" +#define LOG_COL_MAGENTA "\033[35m" +#define LOG_COL_CYAN "\033[36m" +#define LOG_COL_WHITE "\033[37m" + +static int64_t t_us() { + return std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); +} + +// colors +enum gpt_log_col : int { + GPT_LOG_COL_DEFAULT = 0, + GPT_LOG_COL_BOLD, + GPT_LOG_COL_RED, + GPT_LOG_COL_GREEN, + GPT_LOG_COL_YELLOW, + GPT_LOG_COL_BLUE, + GPT_LOG_COL_MAGENTA, + GPT_LOG_COL_CYAN, + GPT_LOG_COL_WHITE, +}; + +// disable colors by default +static std::vector g_col = { + "", + "", + "", + "", + "", + "", + "", + "", + "", +}; + +struct gpt_log_entry { + enum ggml_log_level level; + + bool prefix; + + int64_t timestamp; + + std::vector msg; + + // signals the worker thread to stop + bool is_end; + + void print(FILE * file = nullptr) const { + FILE * fcur = file; + if (!fcur) { + // stderr displays DBG messages only when their verbosity level is not higher than the threshold + // these messages will still be logged to a file + if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + return; + } + + fcur = stdout; + + if (level != GGML_LOG_LEVEL_NONE) { + fcur = stderr; + } + } + + if (level != GGML_LOG_LEVEL_NONE && prefix) { + if (timestamp) { + // [M.s.ms.us] + fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", + g_col[GPT_LOG_COL_BLUE], + (int) (timestamp / 1000000 / 60), + (int) (timestamp / 1000000 % 60), + (int) (timestamp / 1000 % 1000), + (int) (timestamp % 1000), + g_col[GPT_LOG_COL_DEFAULT]); + } + + switch (level) { + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + default: + break; + } + } + + fprintf(fcur, "%s", msg.data()); + + if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { + fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + } + + fflush(fcur); + } +}; + +struct gpt_log { + // default capacity - will be expanded if needed + gpt_log() : gpt_log(256) {} + + gpt_log(size_t capacity) { + file = nullptr; + prefix = false; + timestamps = false; + running = false; + t_start = t_us(); + + // initial message size - will be expanded if longer messages arrive + entries.resize(capacity); + for (auto & entry : entries) { + entry.msg.resize(256); + } + + head = 0; + tail = 0; + + resume(); + } + + ~gpt_log() { + pause(); + if (file) { + fclose(file); + } + } + +private: + std::mutex mtx; + std::thread thrd; + std::condition_variable cv; + + FILE * file; + + bool prefix; + bool timestamps; + bool running; + + int64_t t_start; + + // ring buffer of entries + std::vector entries; + size_t head; + size_t tail; + + // worker thread copies into this + gpt_log_entry cur; + +public: + void add(enum ggml_log_level level, const char * fmt, va_list args) { + std::lock_guard lock(mtx); + + if (!running) { + // discard messages while the worker thread is paused + return; + } + + auto & entry = entries[tail]; + + { + // cannot use args twice, so make a copy in case we need to expand the buffer + va_list args_copy; + va_copy(args_copy, args); + +#if 1 + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy); + } +#else + // hack for bolding arguments + + std::stringstream ss; + for (int i = 0; fmt[i] != 0; i++) { + if (fmt[i] == '%') { + ss << LOG_COL_BOLD; + while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++]; + ss << LOG_COL_DEFAULT; + if (fmt[i] == 0) break; + } + ss << fmt[i]; + } + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy); + } +#endif + } + + entry.level = level; + entry.prefix = prefix; + entry.timestamp = 0; + if (timestamps) { + entry.timestamp = t_us() - t_start; + } + entry.is_end = false; + + tail = (tail + 1) % entries.size(); + if (tail == head) { + // expand the buffer + std::vector new_entries(2*entries.size()); + + size_t new_tail = 0; + + do { + new_entries[new_tail] = std::move(entries[head]); + + head = (head + 1) % entries.size(); + new_tail = (new_tail + 1); + } while (head != tail); + + head = 0; + tail = new_tail; + + for (size_t i = tail; i < new_entries.size(); i++) { + new_entries[i].msg.resize(256); + } + + entries = std::move(new_entries); + } + + cv.notify_one(); + } + + void resume() { + std::lock_guard lock(mtx); + + if (running) { + return; + } + + running = true; + + thrd = std::thread([this]() { + while (true) { + { + std::unique_lock lock(mtx); + cv.wait(lock, [this]() { return head != tail; }); + + cur = entries[head]; + + head = (head + 1) % entries.size(); + } + + if (cur.is_end) { + break; + } + + cur.print(); // stdout and stderr + + if (file) { + cur.print(file); + } + } + }); + } + + void pause() { + { + std::lock_guard lock(mtx); + + if (!running) { + return; + } + + running = false; + + // push an entry to signal the worker thread to stop + { + auto & entry = entries[tail]; + entry.is_end = true; + + tail = (tail + 1) % entries.size(); + } + + cv.notify_one(); + } + + thrd.join(); + } + + void set_file(const char * path) { + pause(); + + if (file) { + fclose(file); + } + + if (path) { + file = fopen(path, "w"); + } else { + file = nullptr; + } + + resume(); + } + + void set_colors(bool colors) { + pause(); + + if (colors) { + g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[GPT_LOG_COL_RED] = LOG_COL_RED; + g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + } else { + for (size_t i = 0; i < g_col.size(); i++) { + g_col[i] = ""; + } + } + + resume(); + } + + void set_prefix(bool prefix) { + std::lock_guard lock(mtx); + + this->prefix = prefix; + } + + void set_timestamps(bool timestamps) { + std::lock_guard lock(mtx); + + this->timestamps = timestamps; + } +}; + +// +// public API +// + +struct gpt_log * gpt_log_init() { + return new gpt_log; +} + +struct gpt_log * gpt_log_main() { + static struct gpt_log log; + + return &log; +} + +void gpt_log_pause(struct gpt_log * log) { + log->pause(); +} + +void gpt_log_resume(struct gpt_log * log) { + log->resume(); +} + +void gpt_log_free(struct gpt_log * log) { + delete log; +} + +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + log->add(level, fmt, args); + va_end(args); +} + +void gpt_log_set_file(struct gpt_log * log, const char * file) { + log->set_file(file); +} + +void gpt_log_set_colors(struct gpt_log * log, bool colors) { + log->set_colors(colors); +} + +void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { + log->set_prefix(prefix); +} + +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { + log->set_timestamps(timestamps); +} diff --git a/common/log.h b/common/log.h index 1bc5328ce..d13f72d89 100644 --- a/common/log.h +++ b/common/log.h @@ -1,724 +1,90 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include +#include "ggml.h" // for ggml_log_level -// -------------------------------- -// -// Basic usage: -// -// -------- -// -// The LOG() and LOG_TEE() macros are ready to go by default -// they do not require any initialization. -// -// LOGLN() and LOG_TEELN() are variants which automatically -// include \n character at the end of the log string. -// -// LOG() behaves exactly like printf, by default writing to a logfile. -// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). -// -// Default logfile is named -// "llama..log" -// Default LOG_TEE() secondary output target is -// stderr -// -// Logs can be dynamically disabled or enabled using functions: -// log_disable() -// and -// log_enable() -// -// A log target can be changed with: -// log_set_target( string ) -// creating and opening, or re-opening a file by string filename -// or -// log_set_target( FILE* ) -// allowing to point at stderr, stdout, or any valid FILE* file handler. -// -// -------- -// -// End of Basic usage. -// -// -------------------------------- - -// Specifies a log target. -// default uses log_handler() with "llama.log" log file -// this can be changed, by defining LOG_TARGET -// like so: -// -// #define LOG_TARGET (a valid FILE*) -// #include "log.h" -// -// or it can be simply redirected to stdout or stderr -// like so: -// -// #define LOG_TARGET stderr -// #include "log.h" -// -// The log target can also be redirected to a different function -// like so: -// -// #define LOG_TARGET log_handler_different() -// #include "log.h" -// -// FILE* log_handler_different() -// { -// return stderr; -// } -// -// or: -// -// #define LOG_TARGET log_handler_another_one("somelog.log") -// #include "log.h" -// -// FILE* log_handler_another_one(char*filename) -// { -// static FILE* logfile = nullptr; -// (...) -// if( !logfile ) -// { -// fopen(...) -// } -// (...) -// return logfile -// } -// -#ifndef LOG_TARGET - #define LOG_TARGET log_handler() -#endif - -#ifndef LOG_TEE_TARGET - #define LOG_TEE_TARGET stderr -#endif - -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - -// Utility to obtain "pid" like unique process id and use it when creating log files. -inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} - -// Utility function for generating log file names with unique id based on thread id. -// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" -// where the number is a runtime id of the current thread. - -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) - -// INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) -{ - static bool _multilog = false; - - if (multilog != LogTriStateSame) - { - _multilog = multilog == LogTriStateTrue; - } - - std::stringstream buf; - - buf << log_file_basename; - if (_multilog) - { - buf << "."; - buf << log_get_pid(); - } - buf << "."; - buf << log_file_extension; - - return buf.str(); -} - -#ifndef LOG_DEFAULT_FILE_NAME - #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log") -#endif - -// Utility for turning #define values into string literals -// so we can have a define for stderr and -// we can print "stderr" instead of literal stderr, etc. -#define LOG_STRINGIZE1(s) #s -#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s) - -#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET) - -// Allows disabling timestamps. -// in order to disable, define LOG_NO_TIMESTAMPS -// like so: -// -// #define LOG_NO_TIMESTAMPS -// #include "log.h" -// -#ifndef LOG_NO_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif +#ifndef __GNUC__ +# define LOG_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else - #define LOG_TIMESTAMP_FMT "%s" - #define LOG_TIMESTAMP_VAL ,"" +# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #endif -#ifdef LOG_TEE_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif -#else - #define LOG_TEE_TIMESTAMP_FMT "%s" - #define LOG_TEE_TIMESTAMP_VAL ,"" -#endif +#define LOG_DEFAULT_DEBUG 1 +#define LOG_DEFAULT_LLAMA 0 -// Allows disabling file/line/function prefix -// in order to disable, define LOG_NO_FILE_LINE_FUNCTION -// like so: +// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower +// set via gpt_log_set_verbosity() +extern int gpt_log_verbosity_thold; + +void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe + +// the gpt_log uses an internal worker thread to print/write log messages +// when the worker thread is paused, incoming log messages are discarded +struct gpt_log; + +struct gpt_log * gpt_log_init(); +struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit +void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe +void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe +void gpt_log_free (struct gpt_log * log); + +LOG_ATTRIBUTE_FORMAT(3, 4) +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); + +// defaults: file = NULL, colors = false, prefix = false, timestamps = false // -// #define LOG_NO_FILE_LINE_FUNCTION -// #include "log.h" +// regular log output: // -#ifndef LOG_NO_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_FLF_FMT "%s" - #define LOG_FLF_VAL ,"" -#endif - -#ifdef LOG_TEE_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_TEE_FLF_FMT "%s" - #define LOG_TEE_FLF_VAL ,"" -#endif - -// INTERNAL, DO NOT USE -// USE LOG() INSTEAD +// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// llm_load_tensors: ggml ctx size = 0.27 MiB +// llm_load_tensors: offloading 32 repeating layers to GPU +// llm_load_tensors: offloading non-repeating layers to GPU // -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ +// with prefix = true, timestamps = true, the log output will look like this: +// +// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB +// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU +// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU +// +// I - info (stdout, V = 0) +// W - warning (stderr, V = 0) +// E - error (stderr, V = 0) +// D - debug (stderr, V = LOG_DEFAULT_DEBUG) +// + +void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe +void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe +void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix + +// helper macros for logging +// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold +// +// for example: +// +// LOG_DBG("this is a debug message: %d\n", expensive_function()); +// +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// + +#define LOG_TMPL(level, verbosity, ...) \ + do { \ + if ((verbosity) <= gpt_log_verbosity_thold) { \ + gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + } \ } while (0) -#else - #define LOG_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - } while (0) -#endif -// INTERNAL, DO NOT USE -// USE LOG_TEE() INSTEAD -// -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#else - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#endif +#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) +#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) -// The '\0' as a last argument, is a trick to bypass the silly -// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" -// so we can have a single macro which can be called just like printf. +#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) +#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) +#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) +#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) -// Main LOG macro. -// behaves like printf, and supports arguments the exact same way. -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG(...) LOG_IMPL(__VA_ARGS__, "") -#else - #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// Main TEE macro. -// does the same as LOG -// and -// simultaneously writes stderr. -// -// Secondary target can be changed just like LOG_TARGET -// by defining LOG_TEE_TARGET -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") -#else - #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// LOG macro variants with auto endline. -#if !defined(_MSC_VER) || defined(__clang__) - #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") - #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") -#else - #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n") - #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") -#endif - -// INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) -{ - static bool _initialized = false; - static bool _append = false; - static bool _disabled = filename.empty() && target == nullptr; - static std::string log_current_filename{filename}; - static FILE *log_current_target{target}; - static FILE *logfile = nullptr; - - if (change) - { - if (append != LogTriStateSame) - { - _append = append == LogTriStateTrue; - return logfile; - } - - if (disable == LogTriStateTrue) - { - // Disable primary target - _disabled = true; - } - // If previously disabled, only enable, and keep previous target - else if (disable == LogTriStateFalse) - { - _disabled = false; - } - // Otherwise, process the arguments - else if (log_current_filename != filename || log_current_target != target) - { - _initialized = false; - } - } - - if (_disabled) - { - // Log is disabled - return nullptr; - } - - if (_initialized) - { - // with fallback in case something went wrong - return logfile ? logfile : stderr; - } - - // do the (re)initialization - if (target != nullptr) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - - log_current_filename = LOG_DEFAULT_FILE_NAME; - log_current_target = target; - - logfile = target; - } - else - { - if (log_current_filename != filename) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - } - - logfile = fopen(filename.c_str(), _append ? "a" : "w"); - } - - if (!logfile) - { - // Verify whether the file was opened, otherwise fallback to stderr - logfile = stderr; - - fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); - fflush(stderr); - - // At this point we let the init flag be to true below, and let the target fallback to stderr - // otherwise we would repeatedly fopen() which was already unsuccessful - } - - _initialized = true; - - return logfile ? logfile : stderr; -} - -// INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) -{ - return log_handler1_impl(change, append, disable, filename, target); -} - -// Disables logs entirely at runtime. -// Makes LOG() and LOG_TEE() produce no output, -// until enabled back. -#define log_disable() log_disable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_disable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); -} - -// Enables logs at runtime. -#define log_enable() log_enable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_enable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); -} - -// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) -#define log_set_target(target) log_set_target_impl(target) - -// INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } - -// INTERNAL, DO NOT USE -inline FILE *log_handler() { return log_handler1_impl(); } - -// Enable or disable creating separate log files for each run. -// can ONLY be invoked BEFORE first log use. -#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") -// Enable or disable append mode for log file. -// can ONLY be invoked BEFORE first log use. -#define log_append(enable) log_append_impl(enable) -// INTERNAL, DO NOT USE -inline FILE *log_append_impl(bool enable) -{ - return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); -} - -inline void log_test() -{ - log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n"); - log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)); - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); - log_set_target(stderr); - LOG("04 Hello World to stderr!\n"); - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n"); - log_set_target(stdout); - LOG("07 Hello World to stdout!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n"); - log_disable(); - LOG("09 Hello World _1_ into the void!\n"); - log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); - log_disable(); - log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); - log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); - log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n"); - log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n"); -#ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n"); - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); - LOG_TEELN("17 Hello msvc TEELN without arguments\n"); - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); - LOG("19 Hello msvc LOG without arguments\n"); - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); - LOGLN("21 Hello msvc LOGLN without arguments\n"); - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); -#endif -} - -inline bool log_param_single_parse(const std::string & param) -{ - if ( param == "--log-test") - { - log_test(); - return true; - } - - if ( param == "--log-disable") - { - log_disable(); - return true; - } - - if ( param == "--log-enable") - { - log_enable(); - return true; - } - - if (param == "--log-new") - { - log_multilog(true); - return true; - } - - if (param == "--log-append") - { - log_append(true); - return true; - } - - return false; -} - -inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string()) -{ - if ( param == "--log-file") - { - if (!check_but_dont_parse) - { - log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log")); - } - - return true; - } - - return false; -} - -inline void log_print_usage() -{ - printf("log options:\n"); - /* format - printf(" -h, --help show this help message and exit\n");*/ - /* spacing - printf("__-param----------------Description\n");*/ - printf(" --log-test Run simple logging test\n"); - printf(" --log-disable Disable trace logs\n"); - printf(" --log-enable Enable trace logs\n"); - printf(" --log-file Specify a log filename (without extension)\n"); - printf(" --log-new Create a separate new log file on start. " - "Each log file will have unique name: \"..log\"\n"); - printf(" --log-append Don't truncate the old log file.\n"); - printf("\n"); -} - -#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) - -// INTERNAL, DO NOT USE -inline void log_dump_cmdline_impl(int argc, char **argv) -{ - std::stringstream buf; - for (int i = 0; i < argc; ++i) - { - if (std::string(argv[i]).find(' ') != std::string::npos) - { - buf << " \"" << argv[i] <<"\""; - } - else - { - buf << " " << argv[i]; - } - } - LOGLN("Cmd:%s", buf.str().c_str()); -} - -#define log_tostr(var) log_var_to_string_impl(var).c_str() - -inline std::string log_var_to_string_impl(bool var) -{ - return var ? "true" : "false"; -} - -inline std::string log_var_to_string_impl(std::string var) -{ - return var; -} - -inline std::string log_var_to_string_impl(const std::vector & var) -{ - std::stringstream buf; - buf << "[ "; - bool first = true; - for (auto e : var) - { - if (first) - { - first = false; - } - else - { - buf << ", "; - } - buf << std::to_string(e); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (const auto & token : tokens) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, token); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "'" << detokenized << "'" - << ":" << std::to_string(token); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (int i = 0; i < batch.n_tokens; ++i) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, batch.token[i]); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); - } - buf << " ]"; - - return buf.str(); -} - -#ifdef LOG_DISABLE_LOGS - -#undef LOG -#define LOG(...) // dummy stub -#undef LOGLN -#define LOGLN(...) // dummy stub - -#undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_DISABLE -#define LOG_DISABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_SET_TARGET -#define LOG_SET_TARGET(...) // dummy stub - -#undef LOG_DUMP_CMDLINE -#define LOG_DUMP_CMDLINE(...) // dummy stub - -#endif // LOG_DISABLE_LOGS +#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) +#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) +#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) +#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index 3ca112ef1..7953c723e 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -2,8 +2,11 @@ #include "common.h" #include "log.h" +#include #include +#include #include +#include void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index c07b5e940..e51d07611 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -325,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { } std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { - std::string result = "\tlogits "; + std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); diff --git a/common/train.cpp b/common/train.cpp index fef1e57c9..661ad8382 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1,9 +1,11 @@ #include "train.h" #include "common.h" +#include #include #include #include +#include struct random_normal_distribution { std::mt19937 gen; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2c6d5d95b..ff4c9226f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -132,12 +132,14 @@ class Model: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - if len(self.part_names) > 1: + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + with open(index_file, "r", encoding="utf-8") as f: index: dict[str, Any] = json.load(f) weight_map = index.get("weight_map") if weight_map is None or not isinstance(weight_map, dict): @@ -145,6 +147,7 @@ class Model: self.tensor_names.update(weight_map.keys()) else: self.tensor_names = tensor_names_from_parts + weight_map = {} for part_name in self.part_names: logger.info(f"gguf: loading model part '{part_name}'") @@ -171,9 +174,17 @@ class Model: data = LazyTorchTensor.from_eager(data) yield name, data - # only verify tensor name presence; it doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError(f"Missing or incomplete model files: {missing_files}") + else: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: @@ -1841,6 +1852,60 @@ class MiniCPMModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(Model): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + rope_dims = hparams["qk_rope_head_dim"] + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN @@ -2944,6 +3009,66 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("OlmoeForCausalLM") +class OlmoeModel(Model): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -3955,6 +4080,36 @@ class ExaoneModel(Model): super().prepare_tensors() +@Model.register("GraniteForCausalLM") +class GraniteModel(LlamaModel): + """Conversion for IBM's GraniteForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE + + def set_gguf_parameters(self): + """Granite uses standard llama parameters with the following differences: + + - No head_dim support + - New multiplier params: + - attention_scale + - embedding_scale + - residual_scale + - logits_scaling + """ + if head_dim := self.hparams.pop("head_dim", None): + logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) + super().set_gguf_parameters() + # NOTE: Convert _multiplier params to _scale params for naming + # consistency + if attention_scale := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_scale(attention_scale) + if embedding_scale := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_scale(embedding_scale) + if residual_scale := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_scale(residual_scale) + if logits_scaling := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scaling) + + ###### CONVERSION LOGIC ###### # tree of lazy tensors diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index ec00fcf78..4a15941f1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -20,6 +21,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + int is_pp_shared = params.is_pp_shared; std::vector n_pp = params.n_pp; @@ -76,7 +79,7 @@ int main(int argc, char ** argv) { const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -93,17 +96,17 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } if (!params.batched_bench_output_jsonl) { - LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); - LOG_TEE("\n"); - LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); - LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + LOG("\n"); + LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG("\n"); + LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); } for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { @@ -133,7 +136,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -155,7 +158,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } @@ -173,20 +176,20 @@ int main(int argc, char ** argv) { const float speed = n_kv / t; if(params.batched_bench_output_jsonl) { - LOG_TEE( + LOG( "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed ); } else { - LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); } } } } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_batch_free(batch); @@ -196,7 +199,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f1df20c6e..7887a43d6 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -23,6 +24,7 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -42,7 +44,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: error: unable to load model\n" , __func__); return 1; } @@ -72,31 +74,29 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); return 1; } const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); - LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch // we use this object to submit token data for decoding llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { if (llama_model_has_encoder(model)) { if (llama_encode(ctx, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -131,7 +131,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -142,7 +142,7 @@ int main(int argc, char ** argv) { //} if (n_parallel > 1) { - LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); } // main loop @@ -175,9 +175,9 @@ int main(int argc, char ** argv) { // is it an end of generation? -> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; - LOG_TEE("\n"); + LOG("\n"); if (n_parallel > 1) { - LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); } continue; @@ -185,8 +185,7 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); } streams[i] += llama_token_to_piece(ctx, new_token_id); @@ -208,27 +207,25 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); - if (n_parallel > 1) { - LOG_TEE("\n"); + LOG("\n"); for (int32_t i = 0; i < n_parallel; ++i) { - LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); } } const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8ca9f8915..ecff95f9a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_ const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; try { w->token_embedding_table.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wo.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->rms_final_weight.resize(p->dim); - LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); if (shared_weights) { w->wcls = {}; } else { w->wcls.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); } } catch (std::length_error &) { @@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); + LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL } static void print_sample_weights(TransformerWeights *w){ - LOG("----- Quick print of first of the weight vales of all the variables\n"); - LOG("%f\n", w->token_embedding_table[0]); - LOG("%f\n", w->rms_att_weight[0]); - LOG("%f\n", w->rms_ffn_weight[0]); + LOG_INF("----- Quick print of first of the weight vales of all the variables\n"); + LOG_INF("%f\n", w->token_embedding_table[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + LOG_INF("%f\n", w->rms_ffn_weight[0]); - LOG("%f\n", w->wq[0]); - LOG("%f\n", w->wk[0]); - LOG("%f\n", w->wv[0]); - LOG("%f\n", w->wo[0]); - LOG("%f\n", w->w1[0]); - LOG("%f\n", w->w2[0]); - LOG("%f\n", w->w3[0]); - LOG("%f\n", w->rms_att_weight[0]); - if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); + LOG_INF("%f\n", w->wq[0]); + LOG_INF("%f\n", w->wk[0]); + LOG_INF("%f\n", w->wv[0]); + LOG_INF("%f\n", w->wo[0]); + LOG_INF("%f\n", w->w1[0]); + LOG_INF("%f\n", w->w2[0]); + LOG_INF("%f\n", w->w3[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -318,20 +319,20 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); - LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); - LOG("%s: n_embd: %u\n", __func__, params->n_embd); - LOG("%s: n_mult: %u\n", __func__, params->n_mult); - LOG("%s: n_head: %u\n", __func__, params->n_head); - LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - LOG("%s: n_ff: %u\n", __func__, params->n_ff); - LOG("%s: n_layer: %u\n", __func__, params->n_layer); - LOG("%s: n_rot: %u\n", __func__, params->n_rot); + LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); + LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult); + LOG_INF("%s: n_head: %u\n", __func__, params->n_head); + LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff); + LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer); + LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot); } static void print_tensor_info(const struct ggml_context * ctx) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - LOG("%s: Allocating ", __func__); + LOG_INF("%s: Allocating ", __func__); int64_t total = 1; int i = 0; for (; i < ggml_n_dims(t); ++i) { @@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { - LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); + LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam gguf_free(ctx); } else { // assume llama2.c vocabulary - LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); + LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -871,23 +872,25 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { + gpt_init(); + struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { return 1; } - log_set_target(stdout); + Config config; TransformerWeights weights = {}; { - LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); FILE * file = fopen(params.fn_llama2c_model, "rb"); if (!file) { - LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -896,7 +899,7 @@ int main(int argc, char ** argv) { // read in the Transformer weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); return 1; } fclose(file); @@ -929,7 +932,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 569b6c38f..41bf4eb2a 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -13,14 +13,15 @@ #include "ggml-metal.h" #endif +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include ////////////////////////////////////////////////// diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index af389abe1..6d629fe4e 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,12 +1,11 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include "ggml.h" #include -#include #include -#include #include /** @@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf(" [\n"); + LOG(" [\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i2 = ne[2] - n; } - printf(" [\n"); + LOG(" [\n"); for (int64_t i1 = 0; i1 < ne[1]; i1++) { if (i1 == n && ne[1] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i1 = ne[1] - n; } - printf(" ["); + LOG(" ["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { if (i0 == n && ne[0] > 2*n) { - printf("..., "); + LOG("..., "); i0 = ne[0] - n; } size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; @@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else { GGML_ABORT("fatal error"); } - printf("%12.4f", v); + LOG("%12.4f", v); sum += v; - if (i0 < ne[0] - 1) printf(", "); + if (i0 < ne[0] - 1) LOG(", "); } - printf("],\n"); + LOG("],\n"); } - printf(" ],\n"); + LOG(" ],\n"); } - printf(" ]\n"); - printf(" sum = %f\n", sum); + LOG(" ]\n"); + LOG(" sum = %f\n", sum); } } @@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); } - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); + LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); // copy the data from the GPU memory if needed @@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) { std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -149,7 +148,7 @@ int main(int argc, char ** argv) { return 1; } - print_build_info(); + gpt_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -166,14 +165,15 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } bool OK = run(ctx, params); @@ -181,7 +181,7 @@ int main(int argc, char ** argv) { return 1; } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 90126ad1e..0051a5eb6 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -406,7 +406,7 @@ int main(int argc, char ** argv) { return 1; } - g_verbose = (params.verbosity == 1); + g_verbose = (params.verbosity > 1); try { lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); ctx.run_merge(); diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 8699d9c11..e9e6134d2 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -153,7 +153,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p throw std::invalid_argument("error: invalid parameter for argument: " + arg); } - if (argc - arg_idx < 2) { + if (argc - arg_idx != 2) { throw std::invalid_argument("error: bad arguments"); } @@ -390,10 +390,17 @@ static void gguf_merge(const split_params & split_params) { int n_split = 1; int total_tensors = 0; - auto * ctx_out = gguf_init_empty(); + // avoid overwriting existing output file + if (std::ifstream(split_params.output.c_str())) { + fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); + exit(EXIT_FAILURE); + } + std::ofstream fout(split_params.output.c_str(), std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors + auto * ctx_out = gguf_init_empty(); + std::vector read_data; std::vector ctx_metas; std::vector ctx_ggufs; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 14c715202..20b99a4fd 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -158,6 +158,8 @@ int main(int argc, char * argv[]) { return 1; } + gpt_init(); + llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_context_params cparams = llama_context_params_from_gpt_params(params); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index b1ae64524..572bc71cd 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -3,7 +3,6 @@ // I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" -#include "log.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -43,6 +42,11 @@ #include #include +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) + //#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image @@ -168,7 +172,7 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - LOG_TEE("key %s not found in file\n", key); + LOG_ERR("key %s not found in file\n", key); throw std::runtime_error(format("Missing required key: %s", key)); } @@ -273,7 +277,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); } @@ -291,7 +295,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -310,7 +314,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -571,7 +575,7 @@ struct clip_ctx { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; } @@ -585,7 +589,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (load_image_size == nullptr) { load_image_size = clip_image_size_init(); } - LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); image_size_width = load_image_size->width; image_size_height = load_image_size->height; if (is_inf) { @@ -1050,21 +1054,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const int idx_name = gguf_find_key(ctx, KEY_NAME); if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug const std::string name = gguf_get_val_str(ctx, idx_name); - LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); } - LOG_TEE("%s: description: %s\n", __func__, description.c_str()); - LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); - LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); - LOG_TEE("\n"); + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, n_kv); + LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_INF("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); // kv const int n_kv = gguf_get_n_kv(ctx); - LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", __func__, n_kv, n_tensors, fname); { std::map n_type; @@ -1075,7 +1079,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { n_type[type]++; } - LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -1091,7 +1095,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } replace_all(value, "\n", "\\n"); - LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1100,7 +1104,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { continue; } - LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1115,7 +1119,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; if (verbosity >= 3) { - LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } @@ -1142,27 +1146,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { #ifdef GGML_USE_CUDA new_clip->backend = ggml_backend_cuda_init(0); - LOG_TEE("%s: CLIP using CUDA backend\n", __func__); + LOG_INF("%s: CLIP using CUDA backend\n", __func__); #endif #ifdef GGML_USE_METAL new_clip->backend = ggml_backend_metal_init(); - LOG_TEE("%s: CLIP using Metal backend\n", __func__); + LOG_INF("%s: CLIP using Metal backend\n", __func__); #endif #ifdef GGML_USE_CANN new_clip->backend = ggml_backend_cann_init(0); - LOG_TEE("%s: CLIP using CANN backend\n", __func__); + LOG_INF("%s: CLIP using CANN backend\n", __func__); #endif #ifdef GGML_USE_VULKAN new_clip->backend = ggml_backend_vk_init(0); - LOG_TEE("%s: CLIP using Vulkan backend\n", __func__); + LOG_INF("%s: CLIP using Vulkan backend\n", __func__); #endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); - LOG_TEE("%s: CLIP using CPU backend\n", __func__); + LOG_INF("%s: CLIP using CPU backend\n", __func__); } // model size and capabilities @@ -1197,16 +1201,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->use_gelu = gguf_get_val_bool(ctx, idx); if (verbosity >= 1) { - LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); - LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -1219,7 +1223,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->ctx_data = ggml_init(params); if (!new_clip->ctx_data) { - LOG_TEE("%s: ggml_init() failed\n", __func__); + LOG_ERR("%s: ggml_init() failed\n", __func__); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1227,7 +1231,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - LOG_TEE("cannot open model file for loading tensors\n"); + LOG_ERR("cannot open model file for loading tensors\n"); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1249,7 +1253,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { - LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); + LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1320,23 +1324,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 2) { - LOG_TEE("\n%s: vision model hparams\n", __func__); - LOG_TEE("image_size %d\n", hparams.image_size); - LOG_TEE("patch_size %d\n", hparams.patch_size); - LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); - LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); - LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); - LOG_TEE("v_n_head %d\n", hparams.n_head); - LOG_TEE("v_n_layer %d\n", hparams.n_layer); - LOG_TEE("v_eps %f\n", hparams.eps); - LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - LOG_TEE("v_image_grid_pinpoints: "); + LOG_INF("\n%s: vision model hparams\n", __func__); + LOG_INF("image_size %d\n", hparams.image_size); + LOG_INF("patch_size %d\n", hparams.patch_size); + LOG_INF("v_hidden_size %d\n", hparams.hidden_size); + LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_INF("v_projection_dim %d\n", hparams.projection_dim); + LOG_INF("v_n_head %d\n", hparams.n_head); + LOG_INF("v_n_layer %d\n", hparams.n_layer); + LOG_INF("v_eps %f\n", hparams.eps); + LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_INF("v_image_grid_pinpoints: "); for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); + LOG_INF("%d ", hparams.image_grid_pinpoints[i]); } - LOG_TEE("\n"); - LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + LOG_INF("\n"); + LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } @@ -1374,7 +1378,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_TEE("%s: failed to load vision model tensors\n", __func__); + LOG_ERR("%s: failed to load vision model tensors\n", __func__); } // LLaVA projection @@ -1403,7 +1407,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); + // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1504,7 +1508,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; @@ -1555,7 +1559,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; auto * data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); + LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1613,7 +1617,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length int nx, ny, nc; auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to decode image bytes\n", __func__); + LOG_ERR("%s: failed to decode image bytes\n", __func__); return false; } @@ -1622,13 +1626,13 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length //check if image needs downscaling if (nx > maxdims || ny > maxdims) { - LOG_TEE("\nImage requires resizing: original size %d x %d scaling to max %d px\n",nx,ny,maxdims); + printf("\nImage requires resizing: original size %d x %d scaling to max %d px\n",nx,ny,maxdims); uint8_t* resized_image = scale_down_image(data, nx, ny, nc, maxdims, maxdims); if(resized_image!=nullptr) { stbi_image_free(data); // Free the original image buffer and assign the new one data = resized_image; - LOG_TEE("Resized to clamped to %d x %d\n",nx,ny); + printf("Resized to clamped to %d x %d\n",nx,ny); } } @@ -1646,7 +1650,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length } if (need_letterbox) { - LOG_TEE("\nImage requires letterboxing: %d x %d changed to %d x %d\n",nx,ny,new_width, new_height); + printf("\nImage requires letterboxing: %d x %d changed to %d x %d\n",nx,ny,new_width, new_height); uint8_t* letterboxed_image = make_new_letterbox_img(data, nx, ny, nc, new_width, new_height); if(letterboxed_image!=nullptr) { @@ -1845,7 +1849,7 @@ static std::pair select_best_resolution(const std::pair & or int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -1963,7 +1967,7 @@ static std::vector> uhd_slice_image(const clip_imag const int multiple = fmin(ceil(ratio), max_slice_nums); std::vector> images; - LOG_TEE("%s: multiple %d\n", __func__, multiple); + LOG_INF("%s: multiple %d\n", __func__, multiple); images.push_back(std::vector()); if (multiple <= 1) { @@ -1978,17 +1982,17 @@ static std::vector> uhd_slice_image(const clip_imag clip_image_u8 * source_image = clip_image_u8_init(); bicubic_resize(*img, *source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); images[images.size()-1].push_back(source_image); std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); clip_image_u8 * refine_image = clip_image_u8_init(); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); // split_to_patches int width = refine_image->nx; @@ -2045,7 +2049,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli int idx = 0; for (size_t i = 0; i < imgs.size(); ++i) { for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); clip_image_f32 * res = clip_image_f32_init(); normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); res_imgs->data[idx++] = *res; @@ -2057,7 +2061,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bool pad_to_square = true; if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } auto & params = ctx->vision_model.hparams; @@ -2134,7 +2138,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } for (size_t i = 0; i < patches.size(); i++) { - // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } @@ -2370,7 +2374,7 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2382,7 +2386,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2612,7 +2616,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i new_type = type; if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); } const size_t n_elms = ggml_nelements(cur); float * f32_data; @@ -2631,7 +2635,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i f32_data = (float *)conv_buf.data(); break; default: - LOG_TEE("Please use an input file in f32 or f16\n"); + LOG_ERR("Please use an input file in f32 or f16\n"); gguf_free(ctx_out); return false; } @@ -2658,7 +2662,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i fout.put(0); } - LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } @@ -2674,8 +2678,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); } return true; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 12fe7345f..8f437863f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -10,6 +10,7 @@ #include #include +#include #include static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { @@ -20,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + LOG_ERR("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); return NULL; } @@ -89,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); if (!embed) { - LOG_TEE("%s: could not load image from base64 string.\n", __func__); + LOG_ERR("%s: could not load image from base64 string.\n", __func__); return NULL; } @@ -114,9 +115,9 @@ struct llava_context { }; static void print_usage(int, char ** argv) { - LOG_TEE("\n example usage:\n"); - LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG("\n example usage:\n"); + LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { @@ -126,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para auto prompt = params->prompt; if (prompt_contains_image(prompt)) { if (!params->image.empty()) { - LOG_TEE("using base64 encoded image instead of command line image path\n"); + LOG_INF("using base64 encoded image instead of command line image path\n"); } embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); if (!embed) { - LOG_TEE("%s: can't load image from prompt\n", __func__); + LOG_ERR("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); @@ -156,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // new templating mode: Provide the full prompt including system message and use as a placeholder for the image system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); + LOG_INF("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } - LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + LOG_INF("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -177,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } @@ -188,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // generate the response - LOG_TEE("\n"); + LOG("\n"); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); } @@ -202,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + LOG("%s", tmp); if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -211,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ } gpt_sampler_free(smpl); - printf("\n"); + LOG("\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -222,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -245,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_clip = ctx_clip; @@ -268,12 +269,6 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - int main(int argc, char ** argv) { ggml_time_init(); @@ -283,27 +278,23 @@ int main(int argc, char ** argv) { return 1; } -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS + gpt_init(); if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } - auto model = llava_init(¶ms); + + auto * model = llava_init(¶ms); if (model == NULL) { fprintf(stderr, "%s: error: failed to init llava model\n", __func__); return 1; } if (prompt_contains_image(params.prompt)) { - auto ctx_llava = llava_init_context(¶ms, model); + auto * ctx_llava = llava_init_context(¶ms, model); - auto image_embed = load_image(ctx_llava, ¶ms, ""); + auto * image_embed = load_image(ctx_llava, ¶ms, ""); // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); @@ -314,11 +305,11 @@ int main(int argc, char ** argv) { llava_free(ctx_llava); } else { for (auto & image : params.image) { - auto ctx_llava = llava_init_context(¶ms, model); + auto * ctx_llava = llava_init_context(¶ms, model); - auto image_embed = load_image(ctx_llava, ¶ms, image); + auto * image_embed = load_image(ctx_llava, ¶ms, image); if (!image_embed) { - std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; + LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); return 1; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index e162586ed..8558c6bdc 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -1,13 +1,23 @@ #include "clip.h" -#include "common.h" -#include "llama.h" #include "llava.h" -#include "base64.hpp" +#include "llama.h" + +#include +#include #include #include +#include +#include #include -#include + +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) // RGB uint8 image struct clip_image_u8 { @@ -54,7 +64,7 @@ static std::pair select_best_resolution(const std::pair& ori int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - LOG_TEE("%s: unable to preprocess image\n", __func__); + LOG_ERR("%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; } @@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli load_image_size->width = img->nx; load_image_size->height = img->ny; clip_add_load_image_size(ctx_clip, load_image_size); - LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding @@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 delete[] img_res_v.data; if (!encoded) { - LOG_TEE("Unable to encode image\n"); + LOG_ERR("Unable to encode image\n"); return false; } @@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); @@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); } - LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); return true; } @@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { - LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); return false; } return true; @@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co } float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { - LOG_TEE("Unable to allocate memory for image embeddings\n"); + LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; } int n_img_pos; if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - LOG_TEE("%s: cannot encode image, aborting\n", __func__); + LOG_ERR("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } @@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ } llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; if (llama_decode(ctx_llama, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } *n_past += n_eval; @@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); - LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); + LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__); return NULL; } @@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_TEE("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: coulnd't embed the image\n", __func__); return NULL; } @@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { auto file = fopen(path, "rb"); if (file == NULL) { - LOG_TEE("%s: can't read file %s\n", __func__, path); + LOG_ERR("%s: can't read file %s\n", __func__, path); return false; } @@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data if (buffer == NULL) { - LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); perror("Memory allocation error"); fclose(file); return false; @@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, image_path); + LOG_ERR("%s: failed to load %s\n", __func__, image_path); return NULL; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3ac455e69..c5156c35b 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -7,9 +7,12 @@ #include "llama.h" #include "ggml.h" +#include #include #include +#include #include +#include // TODO: remove me struct llava_context { struct clip_ctx * ctx_clip = NULL; @@ -18,14 +21,8 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); + LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -36,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -51,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); ctx_params.n_ctx = 2048; } else { ctx_params.n_ctx = params->n_ctx; @@ -60,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->model = model; @@ -89,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); return ctx_clip; } @@ -101,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vectorctx_clip)); std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); slice_embed->embed = image_embed; slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); @@ -143,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e else if (has_minicpmv_projector == 3) { system_prompt = "<|im_start|>user\n"; } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); @@ -162,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e } eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); } static const char * sample(struct gpt_sampler * smpl, @@ -181,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl, } static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ - auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); + auto * ctx_clip = clip_init_context(params); + auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { - std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; + LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str()); return NULL; } // process the prompt if (params->prompt.empty() && params->interactive == false) { - LOG_TEE("prompt should be given or interactive mode should be on"); + LOG_ERR("prompt should be given or interactive mode should be on"); return NULL; } - auto model = llava_init(params); + auto * model = llava_init(params); if (model == NULL) { fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); return NULL; } const int64_t t_llava_init_start_us = ggml_time_us(); - auto ctx_llava = llava_init_context(params, model); + auto * ctx_llava = llava_init_context(params, model); ctx_llava->ctx_clip = ctx_clip; const int64_t t_llava_init_end_us = ggml_time_us(); float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); + LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); const int64_t t_process_image_start_us = ggml_time_us(); process_image(ctx_llava, embeds, params, n_past); const int64_t t_process_image_end_us = ggml_time_us(); float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); + LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); llava_image_embed_free(embeds); return ctx_llava; } -static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ +static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -238,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par // generate the response - LOG_TEE("\n"); + LOG_INF("\n"); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); return smpl; @@ -259,12 +256,7 @@ int main(int argc, char ** argv) { return 1; } -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS + gpt_init(); if (params.mmproj.empty() || (params.image.empty())) { show_additional_info(argc, argv); @@ -273,21 +265,23 @@ int main(int argc, char ** argv) { for (auto & image : params.image) { int n_past = 0; - auto ctx_llava = minicpmv_init(¶ms, image, n_past); + auto * ctx_llava = minicpmv_init(¶ms, image, n_past); if (!params.prompt.empty()) { - LOG_TEE("%s\n", params.prompt.c_str()); - LOG_TEE(""); - auto smpl = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); + LOG("%s\n", params.prompt.c_str()); + LOG(""); + auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; + std::string response; bool have_tmp = false; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, smpl, n_past); + const auto * tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0){ - if(!have_tmp)continue; - else break; + if (!have_tmp) { + continue; + } + break; } if (strstr(tmp, "###")) break; // Yi-VL behavior have_tmp = true; @@ -299,15 +293,15 @@ int main(int argc, char ** argv) { gpt_sampler_free(smpl); }else { while (true) { - LOG_TEE(""); + LOG(""); std::string prompt; std::getline(std::cin, prompt); - LOG_TEE(""); - auto smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); + LOG(""); + auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; + std::string response; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, smpl, n_past); + const auto * tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index de8b792f2..49870b4a4 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,6 +1,7 @@ #include "arg.h" #include "common.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -42,18 +43,14 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + const int W = 15; // lookahead window const int N = 5; // n-gram size const int G = 15; // max verification n-grams const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookahead", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -75,14 +72,14 @@ int main(int argc, char ** argv) { const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -166,7 +163,7 @@ int main(int argc, char ** argv) { { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); fflush(stdout); } } @@ -256,7 +253,7 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__); + LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__); return 1; } @@ -293,10 +290,10 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id); if (v == 0) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } else { // print light cyan - printf("\033[0;96m%s\033[0m", token_str.c_str()); + LOG("\033[0;96m%s\033[0m", token_str.c_str()); } fflush(stdout); @@ -330,21 +327,21 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { - printf(" - ngram %2d: ", i); + LOG(" - ngram %2d: ", i); const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - printf("\n"); + LOG("\n"); } } @@ -455,20 +452,20 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("W = %2d\n", W); - LOG_TEE("N = %2d\n", N); - LOG_TEE("G = %2d\n", G); - LOG_TEE("\n"); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_accept = %d\n", n_accept); + LOG_INF("\n"); + LOG_INF("W = %2d\n", W); + LOG_INF("N = %2d\n", N); + LOG_INF("G = %2d\n", G); + LOG_INF("\n"); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_accept = %d\n", n_accept); - LOG_TEE("\n"); + LOG_INF("\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -482,7 +479,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index f299d68a9..6d1e1ceb9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -5,13 +5,12 @@ #include "llama.h" #include "ggml.h" -#include #include #include +#include #include #include #include -#include int main(int argc, char ** argv){ gpt_params params; @@ -20,6 +19,8 @@ int main(int argc, char ** argv){ return 1; } + gpt_init(); + const int n_draft = params.n_draft; // init llama.cpp @@ -49,7 +50,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -128,7 +129,7 @@ int main(int argc, char ** argv){ const int64_t eta_min = eta_ms / (60*1000); const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; - LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); } // After each chunk, update the dynamic ngram cache with the context ngram cache: @@ -136,24 +137,24 @@ int main(int argc, char ** argv){ ngram_cache_context.clear(); } - LOG_TEE("\n"); + LOG("\n"); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); llama_free(ctx); llama_free_model(model); llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index be6f8d7d7..2ccd0e6c1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -3,6 +3,7 @@ #include "common.h" #include "ngram-cache.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -18,17 +19,13 @@ int main(int argc, char ** argv){ return 1; } + gpt_init(); + // max. number of additional tokens to draft if match is found const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookup", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -58,7 +55,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -76,14 +73,14 @@ int main(int argc, char ** argv){ const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -124,7 +121,7 @@ int main(int argc, char ** argv){ } // print current draft sequence - LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); + LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str()); int i_dft = 0; while (true) { @@ -136,7 +133,7 @@ int main(int argc, char ** argv){ const std::string token_str = llama_token_to_piece(ctx, id); if (!params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } if (llama_token_is_eog(model, id)) { @@ -147,7 +144,7 @@ int main(int argc, char ** argv){ // check if the target token matches the draft if (i_dft < (int) draft.size() && id == draft[i_dft]) { - LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); ++n_accept; ++n_past; ++i_dft; @@ -161,19 +158,19 @@ int main(int argc, char ** argv){ if (params.use_color) { // color accepted draft token - printf("\033[34m%s\033[0m", token_str.c_str()); + LOG("\033[34m%s\033[0m", token_str.c_str()); fflush(stdout); } continue; } if (params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } fflush(stdout); - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); draft.clear(); draft.push_back(id); @@ -224,22 +221,22 @@ int main(int argc, char ** argv){ llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ntarget:\n\n"); + LOG_INF("\ntarget:\n\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -251,7 +248,7 @@ int main(int argc, char ** argv){ llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 890b4c97d..54d51bdc2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,13 +1,12 @@ #include "arg.h" #include "common.h" #include "console.h" +#include "log.h" #include "sampling.h" #include "llama.h" #include "build-info.h" #include -#include -#include #include #include #include @@ -43,11 +42,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -static void print_usage(int, char ** argv) { - printf("\nexample usage:\n"); - printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); - printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); - printf("\n"); +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n"); } static bool file_exists(const std::string & path) { @@ -75,8 +76,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); + LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -84,7 +84,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -114,7 +114,7 @@ static void sigint_handler(int signo) { need_insert_eot = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -123,17 +123,11 @@ static void sigint_handler(int signo) { } #endif -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { llama_chat_msg new_msg{role, content}; auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); - LOG("formatted: %s\n", formatted.c_str()); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; } @@ -144,55 +138,46 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + auto & sparams = params.sparams; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump params ? - //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); + LOG_INF("%s: llama backend init\n", __func__); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -207,21 +192,19 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: error: unable to load model\n", __func__); return 1; } - LOG("%s: llama threadpool init = n_threads = %d\n", - __func__, - (int) params.cpuparams.n_threads - ); + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -233,8 +216,8 @@ int main(int argc, char ** argv) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_threadpool_new(&tpp_batch); if (!threadpool_batch) { - LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - exit(1); + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; } // Start the non-batch threadpool in the paused state @@ -243,55 +226,54 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; } llama_attach_threadpool(ctx, threadpool, threadpool_batch); const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); } else { - LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } std::string path_session = params.path_prompt_cache; std::vector session_tokens; if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_TEE("%s: session file does not exist, will create.\n", __func__); + LOG_INF("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } @@ -299,7 +281,8 @@ int main(int argc, char ** argv) { if (!llama_model_has_encoder(model)) { GGML_ASSERT(!llama_add_eos_token(model)); } - LOG("add_bos: %d\n", add_bos); + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); std::vector embd_inp; @@ -308,31 +291,31 @@ int main(int argc, char ** argv) { ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); + LOG_DBG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, prompt, true, true); } else { - LOG("use session tokens\n"); + LOG_DBG("use session tokens\n"); embd_inp = session_tokens; } - LOG("prompt: \"%s\"\n", log_tostr(prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); } // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { - LOG_TEE("error: input is empty\n"); + LOG_ERR("input is empty\n"); return -1; } } // Tokenize negative prompt if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -346,29 +329,28 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); + LOG_INF("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + LOG_INF("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -390,21 +372,20 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } // ctrl+C handling @@ -424,40 +405,40 @@ int main(int argc, char ** argv) { } if (params.interactive) { - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -465,15 +446,15 @@ int main(int argc, char ** argv) { smpl = gpt_sampler_init(model, sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; } - LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); - LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -487,9 +468,9 @@ int main(int argc, char ** argv) { GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_TEE("\n\n"); + LOG("\n"); if (params.interactive) { const char * control_message; @@ -501,11 +482,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -544,7 +525,7 @@ int main(int argc, char ** argv) { llama_token * enc_input_buf = embd_inp.data(); if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -570,9 +551,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); - fflush(stdout); } if (ga_n == 1) { @@ -580,29 +560,35 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() >= n_ctx) { - if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + if (!params.ctx_shift){ + LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; + } else { + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG("after swap: n_past = %d\n", n_past); - - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); - - LOG("clear session path\n"); - path_session.clear(); } } else { // context extension via Self-Extend @@ -611,10 +597,10 @@ int main(int argc, char ** argv) { const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG("\n"); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -624,7 +610,7 @@ int main(int argc, char ** argv) { ga_i += ga_w/ga_n; - LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -656,19 +642,19 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } @@ -686,14 +672,14 @@ int main(int argc, char ** argv) { need_to_save_session = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG("saved session to %s\n", path_session.c_str()); + LOG_DBG("saved session to %s\n", path_session.c_str()); } const llama_token id = gpt_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* apply_grammar= */ true); + gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -703,16 +689,16 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false); + gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -727,7 +713,7 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id, params.special); // Console/Stream Output - fprintf(stdout, "%s", token_str.c_str()); + LOG("%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -739,8 +725,6 @@ int main(int argc, char ** argv) { output_tokens.push_back(id); output_ss << token_str; } - - fflush(stdout); } } @@ -789,13 +773,13 @@ int main(int argc, char ** argv) { } if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found an EOG token\n"); + LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { @@ -809,7 +793,7 @@ int main(int argc, char ** argv) { chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); } is_interacting = true; - printf("\n"); + LOG("\n"); } } @@ -820,21 +804,21 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.conversation) { - printf("\n> "); + LOG("\n> "); } if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty() && !params.conversation) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); } // color user input only @@ -857,11 +841,11 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty() && !params.conversation) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -878,7 +862,7 @@ int main(int argc, char ** argv) { const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { @@ -901,9 +885,9 @@ int main(int argc, char ** argv) { assistant_ss.str(""); n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -919,7 +903,7 @@ int main(int argc, char ** argv) { // end of generation if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { - LOG_TEE(" [end of text]\n"); + LOG(" [end of text]\n"); break; } @@ -932,11 +916,11 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - LOG_TEE("\n"); + LOG("\n\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -950,9 +934,5 @@ int main(int argc, char ** argv) { ggml_threadpool_free(threadpool); ggml_threadpool_free(threadpool_batch); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 52aa68bfc..7ef8d14f3 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -24,6 +25,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + int n_junk = params.n_junk; int n_keep = params.n_keep; int n_grp = params.grp_attn_n; @@ -63,7 +66,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return 1; } @@ -77,7 +80,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return 1; } @@ -107,14 +110,14 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); + LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token - LOG_TEE("\n"); - LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); - LOG_TEE("prompt tokens: %d\n", n_tokens_all); - //LOG_TEE("prompt: %s\n", params.prompt.c_str()); + LOG_INF("\n"); + LOG_INF("prefix tokens: %d\n", n_tokens_prefix); + LOG_INF("prompt tokens: %d\n", n_tokens_all); + //LOG_INF("prompt: %s\n", params.prompt.c_str()); llama_batch batch = llama_batch_init(params.n_batch, 0, 1); @@ -145,11 +148,11 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_INF("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); if (i + n_batch >= n_tokens_all) { break; @@ -159,7 +162,7 @@ int main(int argc, char ** argv) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) { const int n_discard = n_batch; - LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); + LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -179,18 +182,18 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); } { const int n_discard = n_past - n_ctx + n_predict; if (n_discard > 0) { - LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); + LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -201,17 +204,16 @@ int main(int argc, char ** argv) { } } - LOG_TEE("\n"); - LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); - LOG_TEE("\n"); + LOG_INF("\n"); + LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); + LOG_INF("\n"); // main loop int n_cur = n_tokens_all; int n_decode = 0; - LOG_TEE("%s", prompt_suffix.c_str()); - fflush(stdout); + LOG_INF("%s", prompt_suffix.c_str()); const auto t_main_start = ggml_time_us(); @@ -222,13 +224,12 @@ int main(int argc, char ** argv) { // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); n_decode += 1; @@ -243,22 +244,22 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_sampler_free(smpl); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index d08679edb..5971690f1 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,14 +1,16 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include #include +#include // TODO: remove me static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG("\n"); } struct chunk { @@ -17,7 +19,7 @@ struct chunk { // original file position size_t filepos; // original text data - std::string textdata = ""; + std::string textdata; // tokenized text data std::vector tokens; // embedding @@ -31,14 +33,14 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz std::ifstream f(filename.c_str()); if (!f.is_open()) { - fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); + LOG_ERR("could not open file %s\n", filename.c_str()); return chunks; } chunk current_chunk; char buffer[1024]; int64_t filepos = 0; - std::string current = ""; + std::string current; while (f.read(buffer, 1024)) { current += std::string(buffer, f.gcount()); size_t pos; @@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu if (embd == NULL) { embd = llama_get_embeddings_ith(ctx, i); if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i); continue; } } @@ -116,24 +118,24 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; params.embedding = true; if (params.chunk_size <= 0) { - fprintf(stderr, "chunk_size must be positive\n"); + LOG_ERR("chunk_size must be positive\n"); return 1; } if (params.context_files.empty()) { - fprintf(stderr, "context_files must be specified\n"); + LOG_ERR("context_files must be specified\n"); return 1; } - print_build_info(); - - printf("processing files:\n"); + LOG_INF("processing files:\n"); for (auto & context_file : params.context_files) { - printf("%s\n", context_file.c_str()); + LOG_INF("%s\n", context_file.c_str()); } std::vector chunks; @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - printf("Number of chunks: %ld\n", chunks.size()); + LOG_INF("Number of chunks: %ld\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); @@ -153,7 +155,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } @@ -162,19 +164,19 @@ int main(int argc, char ** argv) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + LOG_ERR("%s: pooling type NONE not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } // max batch size @@ -185,7 +187,7 @@ int main(int argc, char ** argv) { for (auto & chunk : chunks) { auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -199,12 +201,12 @@ int main(int argc, char ** argv) { // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) chunks.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); + LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } - fprintf(stderr, "\n\n"); + LOG_INF("\n\n"); } } @@ -256,7 +258,7 @@ int main(int argc, char ** argv) { // start loop, receive query and return top k similar chunks based on cosine similarity std::string query; while (true) { - printf("Enter query: "); + LOG("Enter query: "); std::getline(std::cin, query); std::vector query_tokens = llama_tokenize(ctx, query, true); @@ -280,18 +282,18 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - printf("Top %d similar chunks:\n", params.sparams.top_k); + LOG("Top %d similar chunks:\n", params.sparams.top_k); for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { - printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); - printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); - printf("similarity: %f\n", similarities[i].second); - printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); - printf("--------------------\n"); + LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); + LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); + LOG("similarity: %f\n", similarities[i].second); + LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); + LOG("--------------------\n"); } } } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); // clean up diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 2daac0884..a9ed747f5 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -272,7 +272,6 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.append('--flash-attn') - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8730f3ed2..7eeab43a4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,6 +2,7 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" @@ -32,22 +33,34 @@ #include "loading.html.hpp" #include -#include #include #include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include + +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) using json = nlohmann::ordered_json; -bool server_verbose = false; -bool server_log_json = true; - enum stop_type { STOP_TYPE_FULL, STOP_TYPE_PARTIAL, @@ -198,6 +211,8 @@ struct server_slot { std::function callback_on_release; void reset() { + SLT_DBG(*this, "%s", "\n"); + n_prompt_tokens = 0; generated_text = ""; truncated = false; @@ -235,8 +250,9 @@ struct server_slot { return state != SLOT_STATE_IDLE; } - void add_token_string(const completion_token_output & token) { + void add_token(const completion_token_output & token) { if (!is_processing()) { + SLT_WRN(*this, "%s", "slot is not processing\n"); return; } generated_token_probs.push_back(token); @@ -244,14 +260,10 @@ struct server_slot { void release() { if (is_processing()) { + SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); + t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; state = SLOT_STATE_IDLE; - LOG_INFO("slot released", { - {"id_slot", id}, - {"id_task", id_task}, - {"n_past", n_past}, - {"truncated", truncated}, - }); callback_on_release(id); } } @@ -299,49 +311,20 @@ struct server_slot { } void print_timings() const { - char buffer[512]; + const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; + const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - double t_token = t_prompt_processing / n_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + const double t_gen = t_token_generation / n_decoded; + const double n_gen_second = 1e3 / t_token_generation * n_decoded; - snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, n_prompt_tokens_processed, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"n_prompt_tokens_processed", n_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - - snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); + SLT_INF(*this, + "\n" + "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r total time = %10.2f ms / %5d tokens\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, + t_token_generation, n_decoded, t_gen, n_gen_second, + t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); } }; @@ -417,8 +400,8 @@ struct server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); } + QUE_DBG("new task, id = %d, front = %d\n", task.id, front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -434,8 +417,8 @@ struct server_queue { for (auto & task : tasks) { if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); } + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -449,6 +432,7 @@ struct server_queue { // Add a new task, but defer until one slot is available void defer(server_task task) { std::unique_lock lock(mutex_tasks); + QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); condition_tasks.notify_one(); } @@ -457,7 +441,6 @@ struct server_queue { int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; - LOG_VERBOSE("new task id", {{"new_id", new_id}}); return new_id; } @@ -499,7 +482,7 @@ struct server_queue { running = true; while (true) { - LOG_VERBOSE("new task may arrive", {}); + QUE_DBG("%s", "processing new tasks\n"); while (true) { std::unique_lock lock(mutex_tasks); @@ -510,21 +493,22 @@ struct server_queue { server_task task = queue_tasks.front(); queue_tasks.pop_front(); lock.unlock(); - LOG_VERBOSE("callback_new_task", {{"id_task", task.id}}); + + QUE_DBG("processing task, id = %d\n", task.id); callback_new_task(task); } // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_update_slots", {}); + QUE_DBG("%s", "update slots\n"); callback_update_slots(); - LOG_VERBOSE("wait for new task", {}); + QUE_DBG("%s", "waiting for new tasks\n"); { std::unique_lock lock(mutex_tasks); if (queue_tasks.empty()) { if (!running) { - LOG_VERBOSE("ending start_loop", {}); + QUE_DBG("%s", "terminate\n"); return; } condition_tasks.wait(lock, [&]{ @@ -548,7 +532,7 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - LOG_VERBOSE("waiting for task id", {{"id_task", id_task}}); + SRV_DBG("waiting for task id = %d\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.insert(id_task); @@ -562,7 +546,7 @@ struct server_response { // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); + SRV_DBG("task id = %d is done\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); @@ -596,12 +580,13 @@ struct server_response { // Send a new result to a waiting id_task void send(server_task_result & result) { - LOG_VERBOSE("send new result", {{"id_task", result.id}}); + SRV_DBG("sending result for task id = %d\n", result.id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { if (result.id == id_task) { - LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}}); + SRV_DBG("task id = %d moved to result queue\n", result.id); + queue_results.push_back(std::move(result)); condition_results.notify_all(); return; @@ -613,7 +598,7 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector lora_adapters; + std::vector loras; gpt_params params; @@ -673,11 +658,13 @@ struct server_context { llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; - ctx = llama_init.context; - lora_adapters = llama_init.lora_adapters; + ctx = llama_init.context; + loras = llama_init.lora_adapters; + params.n_parallel -= 1; // but be sneaky about it + if (model == nullptr) { - LOG_ERROR("unable to load model", {{"model", params.model}}); + SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); return false; } @@ -700,7 +687,7 @@ struct server_context { void init() { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel); for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -709,10 +696,7 @@ struct server_context { slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_INFO("new slot", { - {"id_slot", slot.id}, - {"n_ctx_slot", slot.n_ctx} - }); + SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -723,11 +707,7 @@ struct server_context { //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_INFO("slot self-extend", { - {"id_slot", slot.id}, - {"ga_n", ga_n}, - {"ga_w", ga_w} - }); + SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); } slot.ga_i = 0; @@ -850,11 +830,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lcp similarity", { - {"id_slot", ret->id}, - {"max_lcp_len", max_lcp_len}, - {"similarity", similarity}, - }); + SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity); } } @@ -875,10 +851,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lru", { - {"id_slot", ret->id}, - {"t_last", t_last}, - }); + SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last); } } @@ -942,17 +915,14 @@ struct server_context { } if (slot.params.cache_prompt && slot.ga_n != 1) { - LOG_WARNING("cache_prompt is not supported with group-attention", {}); slot.params.cache_prompt = false; + SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? - LOG_WARNING("Max tokens to predict exceeds server configuration", { - {"params.n_predict", slot.params.n_predict}, - {"slot.n_predict", slot.n_predict}, - }); slot.params.n_predict = slot.n_predict; + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } // infill @@ -1061,16 +1031,13 @@ struct server_context { slot.state = SLOT_STATE_PROCESSING_PROMPT; slot.prompt_tokens.clear(); - LOG_INFO("slot is processing task", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - }); + SLT_INF(slot, "%s", "processing task\n"); return true; } void kv_cache_clear() { - LOG_VERBOSE("clearing KV cache", {}); + SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache llama_kv_cache_clear(ctx); @@ -1078,9 +1045,7 @@ struct server_context { } void system_prompt_update() { - LOG_VERBOSE("system prompt update", { - {"system_prompt", system_prompt}, - }); + SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); kv_cache_clear(); system_tokens.clear(); @@ -1101,7 +1066,7 @@ struct server_context { } if (llama_decode(ctx, batch) != 0) { - LOG_ERROR("llama_decode() failed", {}); + SRV_ERR("%s", "llama_decode() failed\n"); return; } } @@ -1116,11 +1081,9 @@ struct server_context { } bool system_prompt_set(const std::string & sys_prompt) { - system_prompt = sys_prompt; + SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - LOG_VERBOSE("system prompt process", { - {"system_prompt", system_prompt}, - }); + system_prompt = sys_prompt; // release all slots for (server_slot & slot : slots) { @@ -1188,7 +1151,7 @@ struct server_context { // add the token to slot queue and cache } - slot.add_token_string(result); + slot.add_token(result); if (slot.params.stream) { send_partial_response(slot, result); } @@ -1203,55 +1166,30 @@ struct server_context { slot.stopped_limit = true; slot.has_next_token = false; - LOG_VERBOSE("stopped by limit", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_decoded", slot.n_decoded}, - {"n_predict", slot.params.n_predict}, - }); + SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } if (llama_token_is_eog(model, result.tok)) { slot.stopped_eos = true; slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); + SLT_DBG(slot, "%s", "stopped by EOS\n"); } - auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 - && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { - LOG_WARNING("n_predict is not set and self-context extend is disabled." - " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { - { "id_slot", slot.id }, - { "params.n_predict", slot.params.n_predict }, - { "slot.n_prompt_tokens", slot.n_prompt_tokens }, - { "slot.n_decoded", slot.n_decoded }, - { "slot.n_predict", slot.n_predict }, - { "n_slots", params.n_parallel }, - { "slot.n_ctx", slot.n_ctx }, - { "n_ctx", n_ctx }, - { "n_ctx_train", n_ctx_train }, - { "ga_n", slot.ga_n }, - }); + const auto n_ctx_train = llama_n_ctx_train(model); + + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction + + SLT_WRN(slot, + "n_predict (%d) is not set and self-context extend is disabled. " + "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", + slot.params.n_predict, n_ctx_train); } - LOG_VERBOSE("next token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); return slot.has_next_token; // continue } @@ -1308,10 +1246,7 @@ struct server_context { } void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - LOG_ERROR("task error", { - {"id_task", id_task}, - {"error", error}, - }); + SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); server_task_result res; res.id = id_task; @@ -1430,10 +1365,7 @@ struct server_context { } if (embd == NULL) { - LOG_ERROR("failed to get embeddings", { - {"token", batch.token [i]}, - {"seq_id", batch.seq_id[i][0]} - }); + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); res.data = json { {"embedding", std::vector(n_embd, 0.0f)}, @@ -1450,6 +1382,8 @@ struct server_context { }; } + SLT_DBG(slot, "%s", "sending embeddings\n"); + queue_results.send(res); } @@ -1466,7 +1400,7 @@ struct server_context { task.type = SERVER_TASK_TYPE_COMPLETION; if (replace_prompt) { task.data = task_data; - task.data["prompt"] = prompt; + task.data["prompt"] = std::move(prompt); } else { task.data = std::move(task_data); } @@ -1510,7 +1444,8 @@ struct server_context { std::vector cancel_tasks; cancel_tasks.reserve(id_tasks.size()); for (const auto & id_task : id_tasks) { - LOG_VERBOSE("cancel task", {{"id_task", id_task}}); + SRV_WRN("cancel task, id_task = %d\n", id_task); + server_task task; task.type = SERVER_TASK_TYPE_CANCEL; task.id_target = id_task; @@ -1522,7 +1457,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl - void receive_cmpl_results(const std::unordered_set & id_tasks, std::function&)> result_handler, std::function error_handler) { + void receive_cmpl_results( + const std::unordered_set & id_tasks, + const std::function&)> & result_handler, + const std::function & error_handler) { // TODO: currently, there is no way to detect the client has cancelled the request std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { @@ -1541,7 +1479,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl, in stream mode - void receive_cmpl_results_stream(const std::unordered_set & id_tasks, std::function result_handler, std::function error_handler) { + void receive_cmpl_results_stream( + const std::unordered_set & id_tasks, const + std::function & result_handler, const + std::function & error_handler) { size_t n_finished = 0; while (true) { server_task_result result = queue_results.recv(id_tasks); @@ -1589,13 +1530,13 @@ struct server_context { if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); + SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1617,7 +1558,7 @@ struct server_context { slot->index = json_value(task.data, "index", 0); if (!launch_slot_with_task(*slot, task)) { - LOG_ERROR("error while launching slot", task.data); + SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); break; } } break; @@ -1666,18 +1607,7 @@ struct server_context { slots_data.push_back(slot_data); } - LOG_INFO("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots} - }); - - LOG_VERBOSE("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots}, - {"slots", slots_data} - }); + SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); server_task_result res; res.id = task.id; @@ -1723,7 +1653,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1764,7 +1694,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1812,7 +1742,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1834,7 +1764,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - llama_lora_adapters_apply(ctx, lora_adapters); + llama_lora_adapters_apply(ctx, loras); server_task_result result; result.id = task.id; result.stop = true; @@ -1862,7 +1792,7 @@ struct server_context { } if (all_idle) { - LOG_INFO("all slots are idle", {}); + SRV_INF("%s", "all slots are idle\n"); if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); } @@ -1872,7 +1802,7 @@ struct server_context { } { - LOG_VERBOSE("posting NEXT_RESPONSE", {}); + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); server_task task; task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; @@ -1891,17 +1821,7 @@ struct server_context { const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); - LOG_INFO("slot context shift", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_keep", n_keep}, - {"n_left", n_left}, - {"n_discard", n_discard}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()} - }); + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1944,15 +1864,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - LOG_VERBOSE("slot decode token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()}, - {"truncated", slot.truncated} - }); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1973,10 +1886,7 @@ struct server_context { // we haven't tokenized the prompt yet - do it now: if (prompt_tokens.empty()) { - LOG_VERBOSE("tokenizing prompt", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size()); slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; @@ -2020,21 +1930,11 @@ struct server_context { slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("prompt tokenized", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { - LOG_INFO("empty prompt - releasing slot", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); slot.release(); slot.print_timings(); @@ -2076,15 +1976,7 @@ struct server_context { slot.truncated = true; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } @@ -2109,10 +2001,7 @@ struct server_context { if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_INFO("we have to evaluate at least 1 token to generate logits", { - { "id_slot", slot.id }, - { "id_task", slot.id_task } - }); + SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; if (slot.ga_i > 0) { @@ -2161,11 +2050,7 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - LOG_INFO("kv cache rm [p0, end)", { - { "id_slot", slot.id }, - { "id_task", slot.id_task }, - { "p0", p0 } - }); + SLT_INF(slot, "kv cache rm [%d, end)\n", p0); int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; @@ -2194,13 +2079,7 @@ struct server_context { slot_npast++; } - LOG_VERBOSE("prompt processing progress", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens}, - }); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { @@ -2214,12 +2093,7 @@ struct server_context { slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - LOG_VERBOSE("prompt done", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - }); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); } } @@ -2230,13 +2104,11 @@ struct server_context { } if (batch.n_tokens == 0) { - LOG_VERBOSE("no tokens to decode", {}); + SRV_WRN("%s", "no tokens to decode\n"); return; } - LOG_VERBOSE("decoding batch", { - {"n_tokens", batch.n_tokens}, - }); + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); // make sure we're in the right embedding mode llama_set_embeddings(ctx, batch_type == 1); @@ -2254,10 +2126,9 @@ struct server_context { const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); @@ -2267,7 +2138,7 @@ struct server_context { slot.ga_i += slot.ga_w / slot.ga_n; - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); } slot.n_past_se += n_tokens; @@ -2291,11 +2162,7 @@ struct server_context { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); for (auto & slot : slots) { slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); @@ -2307,11 +2174,7 @@ struct server_context { n_batch /= 2; i -= n_batch; - LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); continue; // continue loop of n_batch } @@ -2371,7 +2234,7 @@ struct server_context { } } - LOG_VERBOSE("run slots completed", {}); + SRV_DBG("%s", "run slots completed\n"); } json model_meta() const { @@ -2392,19 +2255,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } - LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + LOG_DBG("request: %s\n", req.body.c_str()); + LOG_DBG("response: %s\n", res.body.c_str()); } std::function shutdown_handler; @@ -2422,9 +2276,6 @@ inline void signal_handler(int signal) { } int main(int argc, char ** argv) { -#if SERVER_VERBOSE != 1 - log_disable(); -#endif // own arguments required by this example gpt_params params; @@ -2432,9 +2283,11 @@ int main(int argc, char ** argv) { return 1; } - // TODO: not great to use extern vars - server_log_json = params.log_json; - server_verbose = params.verbosity > 0; + gpt_init(); + + // enabling this will output extra debug information in the HTTP responses from the server + // see format_final_response_oaicompat() + const bool verbose = params.verbosity > 9; // struct that contains llama context and inference server_context ctx_server; @@ -2450,27 +2303,20 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - LOG_INFO("build info", { - {"build", LLAMA_BUILD_NUMBER}, - {"commit", LLAMA_COMMIT} - }); - - LOG_INFO("system info", { - {"n_threads", params.cpuparams.n_threads}, - {"n_threads_batch", params.cpuparams_batch.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); + LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}}); + LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); svr.reset( new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) ); } else { - LOG_INFO("Running without SSL", {}); + LOG_INF("Running without SSL\n"); svr.reset(new httplib::Server()); } #else @@ -2492,13 +2338,13 @@ int main(int argc, char ** argv) { svr->set_logger(log_server_request); - auto res_error = [](httplib::Response & res, json error_data) { + auto res_error = [](httplib::Response & res, const json & error_data) { json final_response {{"error", error_data}}; res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; - auto res_ok = [](httplib::Response & res, json data) { + auto res_ok = [](httplib::Response & res, const json & data) { res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = 200; }; @@ -2506,7 +2352,7 @@ int main(int argc, char ** argv) { svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { std::string message; try { - std::rethrow_exception(std::move(ep)); + std::rethrow_exception(ep); } catch (std::exception & e) { message = e.what(); } catch (...) { @@ -2514,7 +2360,7 @@ int main(int argc, char ** argv) { } json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_VERBOSE("Got exception", formatted_error); + LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); res_error(res, formatted_error); }); @@ -2589,7 +2435,7 @@ int main(int argc, char ** argv) { // API key is invalid or not provided res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - LOG_WARNING("Unauthorized: Invalid API Key", {}); + LOG_WRN("Unauthorized: Invalid API Key\n"); return false; }; @@ -2926,14 +2772,14 @@ int main(int argc, char ** argv) { } res_ok(res, arr); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { return server_sent_event(sink, "data", result.data); - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); sink.done(); @@ -2954,7 +2800,7 @@ int main(int argc, char ** argv) { }; // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2971,16 +2817,16 @@ int main(int argc, char ** argv) { const auto completion_id = gen_chatcmplid(); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id); + json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { @@ -2991,7 +2837,7 @@ int main(int argc, char ** argv) { } } return true; // ok - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); static const std::string ev_done = "data: [DONE]\n\n"; @@ -3104,7 +2950,7 @@ int main(int argc, char ** argv) { for (const auto & res : results) { responses.push_back(res.data); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); error = true; }); @@ -3123,12 +2969,12 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { json result = json::array(); - for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { - auto & la = ctx_server.lora_adapters[i]; + for (size_t i = 0; i < ctx_server.loras.size(); ++i) { + auto & lora = ctx_server.loras[i]; result.push_back({ {"id", i}, - {"path", la.path}, - {"scale", la.scale}, + {"path", lora.path}, + {"scale", lora.scale}, }); } res_ok(res, result); @@ -3137,11 +2983,11 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { const std::vector body = json::parse(req.body); - int max_idx = ctx_server.lora_adapters.size(); + int max_idx = ctx_server.loras.size(); // clear existing value - for (auto & la : ctx_server.lora_adapters) { - la.scale = 0.0f; + for (auto & lora : ctx_server.loras) { + lora.scale = 0.0f; } // set value @@ -3149,7 +2995,7 @@ int main(int argc, char ** argv) { int id = entry.at("id"); float scale = entry.at("scale"); if (0 <= id && id < max_idx) { - ctx_server.lora_adapters[id].scale = scale; + ctx_server.loras[id].scale = scale; } else { throw std::runtime_error("invalid adapter id"); } @@ -3244,59 +3090,58 @@ int main(int argc, char ** argv) { // bind HTTP listen port, run the HTTP server in a thread if (!svr->bind_to_port(params.hostname, params.port)) { - LOG_ERROR("couldn't bind HTTP server socket", { - {"hostname", params.hostname}, - {"port", params.port}, - }); + //LOG_ERROR("couldn't bind HTTP server socket", { + // {"hostname", params.hostname}, + // {"port", params.port}, + //}); + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); clean_up(); - LOG_ERROR("exiting due to HTTP server error", {}); return 1; } std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); - LOG_INFO("HTTP server is listening", log_data); + LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model - LOG_INFO("loading model", log_data); + LOG_INF("%s: loading model\n", __func__); + if (!ctx_server.load_model(params)) { clean_up(); t.join(); - LOG_ERROR("exiting due to model loading error", {}); + LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; - } else { - ctx_server.init(); - state.store(SERVER_STATE_READY); - - LOG_INFO("model loaded", {}); - - // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { - LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - params.chat_template = "chatml"; - } - } - - // print sample chat example to make it clear which template is used - { - LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, - {"built_in", params.chat_template.empty()}, - }); - } - - ctx_server.queue_tasks.on_new_task(std::bind( - &server_context::process_single_task, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_update_slots(std::bind( - &server_context::update_slots, &ctx_server)); - - shutdown_handler = [&](int) { - ctx_server.queue_tasks.terminate(); - }; - ctx_server.queue_tasks.start_loop(); } + ctx_server.init(); + state.store(SERVER_STATE_READY); + + LOG_INF("%s: model loaded\n", __func__); + + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { + if (!ctx_server.validate_model_chat_template()) { + LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); + params.chat_template = "chatml"; + } + } + + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); + + ctx_server.queue_tasks.on_new_task(std::bind( + &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( + &server_context::update_slots, &ctx_server)); + + shutdown_handler = [&](int) { + ctx_server.queue_tasks.terminate(); + }; + + LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + + ctx_server.queue_tasks.start_loop(); + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore new file mode 100644 index 000000000..1d17dae13 --- /dev/null +++ b/examples/server/tests/.gitignore @@ -0,0 +1 @@ +.venv diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 0f4249b13..062f084be 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1372,8 +1372,6 @@ def start_server_background(context): server_args.append('--verbose') if context.lora_file: server_args.extend(['--lora', context.lora_file]) - if 'SERVER_LOG_FORMAT_JSON' not in os.environ: - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [context.server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index adb1a1cb9..f093f547f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1,7 +1,8 @@ #pragma once -#include "llama.h" #include "common.h" +#include "log.h" +#include "llama.h" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -15,10 +16,10 @@ #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +#include +#include #include #include -#include -#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -35,32 +36,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -extern bool server_verbose; -extern bool server_log_json; - -#ifndef SERVER_VERBOSE -#define SERVER_VERBOSE 1 -#endif - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do \ - { \ - if (server_verbose) \ - { \ - server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul try { return body.at(key); } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { - std::stringstream ss; - ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value."; - LOG_WARNING(ss.str().c_str(), body); + LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name()); return default_value; } } else { @@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; - - if (server_log_json) { - log.merge_patch({ - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - - if (!extra.empty()) { - log.merge_patch(extra); - } - - printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - - if (!extra.empty()) { - log.merge_patch(extra); - } - std::stringstream ss; - ss << buf << " |"; - for (const auto & el : log.items()) - { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - ss << " " << el.key() << "=" << value; - } - - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - } - fflush(stdout); -} - // // chat template utils // @@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); - LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); + const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); + return formatted_chat; } @@ -243,10 +175,7 @@ static std::string random_string() { } static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - - return chatcmplid.str(); + return "chatcmpl-" + random_string(); } // @@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin return std::string::npos; } -static bool json_is_array_of_numbers(json data) { +static bool json_is_array_of_numbers(const json & data) { if (data.is_array()) { for (const auto & e : data) { if (!e.is_number()) { @@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector unsupported_params { "tools", "tool_choice" }; - for (auto & param : unsupported_params) { + for (const auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); } @@ -444,7 +374,7 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { +static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { bool stopped_word = result.count("stopped_word") != 0; bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); @@ -481,7 +411,8 @@ static json format_final_response_oaicompat(const json & request, json result, c {"id", completion_id} }; - if (server_verbose) { + // extra fields for debugging purposes + if (verbose) { res["__verbose"] = result; } @@ -493,7 +424,7 @@ static json format_final_response_oaicompat(const json & request, json result, c } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { +static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { return std::vector({result}); } @@ -595,7 +526,7 @@ static std::vector format_partial_response_oaicompat(json result, const st static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; - for (auto & elem : embeddings) { + for (const auto & elem : embeddings) { data.push_back(json{ {"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 0c923d4ed..c2b7267c8 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,16 +1,14 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" -#include -#include -#include #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -23,6 +21,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + // total length of the sequence including the prompt const int n_predict = params.n_predict; @@ -69,25 +69,24 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); + LOG("\n"); + LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch with size 512 // we use this object to submit token data for decoding @@ -102,7 +101,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG("%s: llama_decode() failed\n", __func__); return 1; } @@ -116,16 +115,16 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // prepare the next batch @@ -141,23 +140,23 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_batch_free(batch); llama_sampler_free(smpl); diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index a8cf0aa64..3b9ba3b2d 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -11,16 +11,17 @@ source /opt/intel/oneapi/setvars.sh #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" -MODEL_FILE=llama-2-7b.Q4_0.gguf +MODEL_FILE=models/llama-2-7b.Q4_0.gguf NGL=33 +CONEXT=8192 if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} fi diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index c817be566..a9af6471f 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -1,11 +1,13 @@ #include "common.h" +//#include "log.h" // TODO: start using log.h #include "llama.h" -#include #include +#include #include #include #include +#include // TODO: remove me #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -13,25 +15,25 @@ #include // For CommandLineToArgvW #endif -static void print_usage_information(const char * argv0, FILE * stream) { - fprintf(stream, "usage: %s [options]\n\n", argv0); - fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); - fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); - fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); - fprintf(stream, "to control the behavior of the tokenizer.\n\n"); - fprintf(stream, " The possible options are:\n"); - fprintf(stream, "\n"); - fprintf(stream, " -h, --help print this help and exit\n"); - fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); - fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); - fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); - fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); - fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); - fprintf(stream, " --stdin read prompt from standard input.\n"); - fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); - fprintf(stream, " --no-parse-special do not parse control tokens.\n"); - fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); - fprintf(stream, " --show-count print the total number of tokens.\n"); +static void print_usage_information(const char * argv0) { + printf("usage: %s [options]\n\n", argv0); + printf("The tokenize program tokenizes a prompt using a given model,\n"); + printf("and prints the resulting tokens to standard output.\n\n"); + printf("It needs a model file, a prompt, and optionally other flags\n"); + printf("to control the behavior of the tokenizer.\n\n"); + printf(" The possible options are:\n"); + printf("\n"); + printf(" -h, --help print this help and exit\n"); + printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n"); + printf(" --ids if given, only print numerical token IDs, and not token strings.\n"); + printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); + printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); + printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); + printf(" --stdin read prompt from standard input.\n"); + printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + printf(" --no-parse-special do not parse control tokens.\n"); + printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n"); + printf(" --show-count print the total number of tokens.\n"); } static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { @@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) { const int argc = argv.size(); if (argc <= 1) { - print_usage_information(argv[0].c_str(), stderr); + print_usage_information(argv[0].c_str()); return 1; } @@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) { for (; iarg < argc; ++iarg) { std::string arg{argv[iarg]}; if (arg == "-h" || arg == "--help") { - print_usage_information(argv[0].c_str(), stdout); + print_usage_information(argv[0].c_str()); return 0; } else if (arg == "--ids") { @@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) { // Start actually doing the tokenizing stuff. ////// -#ifdef LOG_DISABLE_LOGS - disable_logging = true; -#endif - if (disable_logging) { llama_log_set(llama_log_callback_null, NULL); } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 07c4bfe93..b7a7d03c9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -570,10 +570,11 @@ extern "C" { }; enum ggml_log_level { - GGML_LOG_LEVEL_ERROR = 2, - GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4, - GGML_LOG_LEVEL_DEBUG = 5 + GGML_LOG_LEVEL_NONE = 0, + GGML_LOG_LEVEL_INFO = 1, + GGML_LOG_LEVEL_WARN = 2, + GGML_LOG_LEVEL_ERROR = 3, + GGML_LOG_LEVEL_DEBUG = 4, }; enum ggml_tensor_flag { diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 72cb83c9b..27375d0d7 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include #include diff --git a/ggml/src/ggml-cpu-impl.h b/ggml/src/ggml-cpu-impl.h new file mode 100644 index 000000000..5b45155b0 --- /dev/null +++ b/ggml/src/ggml-cpu-impl.h @@ -0,0 +1,614 @@ +#pragma once + +// GGML CPU internal header + +#include "ggml.h" +#include "ggml-impl.h" +#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ +//#include +#include +#include // memcpy +#include // fabsf + + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_MSC_VER) + +#define m512bh(p) p +#define m512i(p) p + +#else + +#define m512bh(p) (__m512bh)(p) +#define m512i(p) (__m512i)(p) + +#endif + +/** + * Converts brain16 to float32. + * + * The bfloat16 floating point format has the following structure: + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───┐ + * 0b0000000000000000 brain16 + * + * Since bf16 has the same number of exponent bits as a 32bit float, + * encoding and decoding numbers becomes relatively straightforward. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───────────────────┐ + * 0b00000000000000000000000000000000 IEEE binary32 + * + * For comparison, the standard fp16 format has fewer exponent bits. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌─┴─┐┌─┴──────┐ + * 0b0000000000000000 IEEE binary16 + * + * @see IEEE 754-2008 + */ +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + +/** + * Converts float32 to brain16. + * + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. + * This code should vectorize nicely if using modern compilers. + */ +static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + ggml_bf16_t h; + union { + float f; + uint32_t i; + } u; + u.f = s; + if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ + h.bits = (u.i >> 16) | 64; /* force to quiet */ + return h; + } + h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; + return h; +} + +#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) +#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#endif + +// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available +#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __SSE3__ +#define __SSE3__ +#endif +#ifndef __SSSE3__ +#define __SSSE3__ +#endif +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#include +#endif + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#ifdef _MSC_VER + +typedef uint16_t ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +typedef __fp16 ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif // _MSC_VER + +#if !defined(__aarch64__) + +// 32-bit ARM compatibility + +// vaddlvq_s16 +// vpaddq_s16 +// vpaddq_s32 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 + +inline static int32_t vaddlvq_s16(int16x8_t v) { + int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); + return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; + res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct ggml_int16x8x2_t { + int16x8_t val[2]; +} ggml_int16x8x2_t; + +inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { + ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct ggml_uint8x16x2_t { + uint8x16_t val[2]; +} ggml_uint8x16x2_t; + +inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { + ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct ggml_uint8x16x4_t { + uint8x16_t val[4]; +} ggml_uint8x16x4_t; + +inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { + ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct ggml_int8x16x2_t { + int8x16_t val[2]; +} ggml_int8x16x2_t; + +inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { + ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct ggml_int8x16x4_t { + int8x16_t val[4]; +} ggml_int8x16x4_t; + +inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { + ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +// NOTE: not tested +inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + uint8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +#else + +#define ggml_int16x8x2_t int16x8x2_t +#define ggml_uint8x16x2_t uint8x16x2_t +#define ggml_uint8x16x4_t uint8x16x4_t +#define ggml_int8x16x2_t int8x16x2_t +#define ggml_int8x16x4_t int8x16x4_t + +#define ggml_vld1q_s16_x2 vld1q_s16_x2 +#define ggml_vld1q_u8_x2 vld1q_u8_x2 +#define ggml_vld1q_u8_x4 vld1q_u8_x4 +#define ggml_vld1q_s8_x2 vld1q_s8_x2 +#define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 + +#endif // !defined(__aarch64__) + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + +#endif // defined(__ARM_NEON) + +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + ggml_fp16_internal_t tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + ggml_fp16_internal_t tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; +} + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#if defined(__loongarch64) +#if defined(__loongarch_asx) +#include +#endif +#if defined(__loongarch_sx) +#include +#endif +#endif + +#if defined(__loongarch_asx) + +typedef union { + int32_t i; + float f; +} ft_union; + +/* float type data load instructions */ +static __m128 __lsx_vreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +static __m256 __lasx_xvreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); +} +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) + +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in ggml_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(GGML_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index cb7f7728b..833984190 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -1,15 +1,17 @@ #pragma once -#include "ggml.h" - // GGML internal header +#include "ggml.h" + #include #include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ -#include #include -#include // memcpy -#include // fabsf +#include + +#ifdef __cplusplus +extern "C" { +#endif #undef MIN #undef MAX @@ -17,96 +19,6 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#if defined(_MSC_VER) - -#define m512bh(p) p -#define m512i(p) p - -#else - -#define m512bh(p) (__m512bh)(p) -#define m512i(p) (__m512i)(p) - -#endif - -/** - * Converts brain16 to float32. - * - * The bfloat16 floating point format has the following structure: - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───┐ - * 0b0000000000000000 brain16 - * - * Since bf16 has the same number of exponent bits as a 32bit float, - * encoding and decoding numbers becomes relatively straightforward. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───────────────────┐ - * 0b00000000000000000000000000000000 IEEE binary32 - * - * For comparison, the standard fp16 format has fewer exponent bits. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌─┴─┐┌─┴──────┐ - * 0b0000000000000000 IEEE binary16 - * - * @see IEEE 754-2008 - */ -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -/** - * Converts float32 to brain16. - * - * This is binary identical with Google Brain float conversion. - * Floats shall round to nearest even, and NANs shall be quiet. - * Subnormals aren't flushed to zero, except perhaps when used. - * This code should vectorize nicely if using modern compilers. - */ -static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { - ggml_bf16_t h; - union { - float f; - uint32_t i; - } u; - u.f = s; - if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ - h.bits = (u.i >> 16) | 64; /* force to quiet */ - return h; - } - h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; - return h; -} - -#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) -#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) - -#ifdef __cplusplus -extern "C" { -#endif - // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop @@ -121,520 +33,6 @@ extern "C" { #endif #endif -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#endif - -// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available -#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __SSE3__ -#define __SSE3__ -#endif -#ifndef __SSSE3__ -#define __SSSE3__ -#endif -#endif - -#if defined(__ARM_FEATURE_SVE) -#include -#include -#endif - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#ifdef _MSC_VER - -typedef uint16_t ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } - -#else - -typedef __fp16 ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } - -#endif // _MSC_VER - -#if !defined(__aarch64__) - -// 32-bit ARM compatibility - -// vaddlvq_s16 -// vpaddq_s16 -// vpaddq_s32 -// vaddvq_s32 -// vaddvq_f32 -// vmaxvq_f32 -// vcvtnq_s32_f32 -// vzip1_u8 -// vzip2_u8 - -inline static int32_t vaddlvq_s16(int16x8_t v) { - int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); - return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { - int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); - int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); - return vcombine_s32(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -// vld1q_s16_x2 -// vld1q_u8_x2 -// vld1q_u8_x4 -// vld1q_s8_x2 -// vld1q_s8_x4 -// TODO: double-check these work correctly - -typedef struct ggml_int16x8x2_t { - int16x8_t val[2]; -} ggml_int16x8x2_t; - -inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { - ggml_int16x8x2_t res; - - res.val[0] = vld1q_s16(ptr + 0); - res.val[1] = vld1q_s16(ptr + 8); - - return res; -} - -typedef struct ggml_uint8x16x2_t { - uint8x16_t val[2]; -} ggml_uint8x16x2_t; - -inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { - ggml_uint8x16x2_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - - return res; -} - -typedef struct ggml_uint8x16x4_t { - uint8x16_t val[4]; -} ggml_uint8x16x4_t; - -inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { - ggml_uint8x16x4_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - res.val[2] = vld1q_u8(ptr + 32); - res.val[3] = vld1q_u8(ptr + 48); - - return res; -} - -typedef struct ggml_int8x16x2_t { - int8x16_t val[2]; -} ggml_int8x16x2_t; - -inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { - ggml_int8x16x2_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - - return res; -} - -typedef struct ggml_int8x16x4_t { - int8x16_t val[4]; -} ggml_int8x16x4_t; - -inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { - ggml_int8x16x4_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - res.val[2] = vld1q_s8(ptr + 32); - res.val[3] = vld1q_s8(ptr + 48); - - return res; -} - -// NOTE: not tested -inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { - int8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -// NOTE: not tested -inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -#else - -#define ggml_int16x8x2_t int16x8x2_t -#define ggml_uint8x16x2_t uint8x16x2_t -#define ggml_uint8x16x4_t uint8x16x4_t -#define ggml_int8x16x2_t int8x16x2_t -#define ggml_int8x16x4_t int8x16x4_t - -#define ggml_vld1q_s16_x2 vld1q_s16_x2 -#define ggml_vld1q_u8_x2 vld1q_u8_x2 -#define ggml_vld1q_u8_x4 vld1q_u8_x4 -#define ggml_vld1q_s8_x2 vld1q_s8_x2 -#define ggml_vld1q_s8_x4 vld1q_s8_x4 -#define ggml_vqtbl1q_s8 vqtbl1q_s8 -#define ggml_vqtbl1q_u8 vqtbl1q_u8 - -#endif // !defined(__aarch64__) - -#if !defined(__ARM_FEATURE_DOTPROD) - -inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { - const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); - const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); - - return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); -} - -#else - -#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) - -#endif // !defined(__ARM_FEATURE_DOTPROD) - -#endif // defined(__ARM_NEON) - -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - ggml_fp16_internal_t tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - ggml_fp16_internal_t tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; -} - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#if defined(__loongarch64) -#if defined(__loongarch_asx) -#include -#endif -#if defined(__loongarch_sx) -#include -#endif -#endif - -#if defined(__loongarch_asx) - -typedef union { - int32_t i; - float f; -} ft_union; - -/* float type data load instructions */ -static __m128 __lsx_vreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); -} - -static __m256 __lasx_xvreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); -} -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) - -#ifdef __ARM_FEATURE_SVE -#include -#endif // __ARM_FEATURE_SVE - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -extern float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif - -#if !defined(GGML_FP32_TO_FP16) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif - -enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT -}; - // bitset typedef uint32_t ggml_bitset_t; @@ -761,6 +159,12 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g // computation graph +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + struct ggml_cgraph { int size; int n_nodes; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 88821dd4c..1641a2f5c 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -13,13 +13,16 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) #ifdef GGML_METAL_NDEBUG +#define GGML_METAL_LOG(...) #define GGML_METAL_LOG_INFO(...) #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else +#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__) #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -3183,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)", + GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, @@ -3191,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); } } else { GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", @@ -3224,15 +3225,19 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = nil; + + if (size_aligned > 0) { + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } } - if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { + if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); @@ -3309,14 +3314,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_aligned); @@ -3332,14 +3340,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, for (size_t i = 0; i < size; i += size_step) { const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_step_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_step_aligned); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index c757a08b6..5618bd427 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include @@ -231,6 +232,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } + +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { + const __m128i ax = _mm_sign_epi8(x, x); + const __m128i sy = _mm_sign_epi8(y, x); + return _mm_maddubs_epi16(ax, sy); +} #endif #elif defined(__SSSE3__) // horizontally add 4x4 floats @@ -4207,37 +4214,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc); #elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); + const __m128i mone = _mm_set1_epi16(1); - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); + const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); + const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); + const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); + accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2); } - sumf = hsum_float_8(acc); + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); #elif defined(__SSSE3__) // set constants const __m128i lowMask = _mm_set1_epi8(0xF); @@ -11820,15 +11827,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * #endif } - -#if defined(__AVX__) -static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { - const __m128i ax = _mm_sign_epi8(x, x); - const __m128i sy = _mm_sign_epi8(y, x); - return _mm_maddubs_epi16(ax, sy); -} -#endif - #if defined(__AVX2__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3cf689bd8..6f69a4526 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2,6 +2,7 @@ #define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #include "ggml.h" #include "ggml-aarch64.h" @@ -2020,10 +2021,11 @@ struct ggml_threadpool { // these are atomic as an annotation for thread-sanitizer atomic_bool stop; // Used for stopping the threadpool altogether atomic_bool pause; // Used for pausing the threadpool or individual threads + atomic_bool abort; // Used for aborting processing of a graph struct ggml_compute_state * workers; // per thread state int n_threads_max; // number of threads in the pool - int n_threads_cur; // number of threads used in the current graph + atomic_int n_threads_cur; // number of threads used in the current graph int32_t prio; // Scheduling priority uint32_t poll; // Polling level (0 - no polling) @@ -3194,41 +3196,36 @@ inline static void ggml_critical_section_start(void) { } } +static void ggml_barrier(struct ggml_threadpool * tp) { + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + #ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_threadpool * threadpool) { - if (threadpool->n_threads_cur == 1) { - return; - } - #pragma omp barrier -} #else -static void ggml_barrier(struct ggml_threadpool * threadpool) { - if (threadpool->n_threads_cur == 1) { - return; - } + int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed); - atomic_int * n_barrier = &threadpool->n_barrier; - atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst); - int n_threads = threadpool->n_threads_cur; - int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed); - - if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { + int last = 0; + if (n_barrier == (n_threads - 1)) { // last thread - atomic_store(n_barrier, 0); - atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed); + atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed); + last = 1; } else { // wait for other threads - while (true) { - if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) { - return; - } + while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) { ggml_thread_cpu_relax(); } } -} + + // exit barrier (full seq-cst fence) + atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst); #endif +} // TODO: make this somehow automatically executed // some sort of "sentry" mechanism @@ -19991,34 +19988,33 @@ struct ggml_cplan ggml_graph_plan( static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * tp = state->threadpool; - const struct ggml_cgraph * cgraph = state->threadpool->cgraph; - const struct ggml_cplan * cplan = state->threadpool->cplan; + const struct ggml_cgraph * cgraph = tp->cgraph; + const struct ggml_cplan * cplan = tp->cplan; set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { /*.ith =*/ state->ith, - /*.nth =*/ state->threadpool->n_threads_cur, + /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), /*.wsize =*/ cplan->work_size, /*.wdata =*/ cplan->work_data, - /*.threadpool=*/ state->threadpool, + /*.threadpool=*/ tp, }; - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; ggml_compute_forward(¶ms, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->threadpool->ec = GGML_STATUS_ABORTED; + if (state->ith == 0 && cplan->abort_callback && + cplan->abort_callback(cplan->abort_callback_data)) { + tp->abort = true; + tp->ec = GGML_STATUS_ABORTED; } ggml_barrier(state->threadpool); - - if (state->threadpool->ec != GGML_STATUS_SUCCESS) { - break; - } } return 0; @@ -20026,7 +20022,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifndef GGML_USE_OPENMP -static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { +// check if thread is active +static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed); + return (state->ith < n_threads); +} + +// check if thread is ready to proceed (exit from polling or sleeping) +static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) { struct ggml_threadpool * threadpool = state->threadpool; if (state->pending || threadpool->stop || threadpool->pause) { return true; } @@ -20034,21 +20038,34 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { // check for new graph/work int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); if (new_graph != state->last_graph) { - state->pending = (state->ith < threadpool->n_threads_cur); + state->pending = ggml_graph_compute_thread_active(state); state->last_graph = new_graph; } return state->pending; } +// sync thread state after polling +static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer + // so instead we just use a dummy read-modify-write + atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst); +} + static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { struct ggml_threadpool * threadpool = state->threadpool; + // Skip polling for unused threads + if (!ggml_graph_compute_thread_active(state)) { + return state->pending; + } + // This seems to make 0 ... 100 a decent range for polling level across modern processors. // Perhaps, we can adjust it dynamically based on load and things. const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; - for (uint64_t i=0; !ggml_graph_compute_ready(state) && ithreadpool; if (ggml_graph_compute_poll_for_work(state)) { + ggml_graph_compute_thread_sync(state); return state->pending; } ggml_mutex_lock_shared(&threadpool->mutex); - while (!ggml_graph_compute_ready(state)) { + while (!ggml_graph_compute_thread_ready(state)) { // No new work. Wait for the signal. - GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith); + GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith); ggml_cond_wait(&threadpool->cond, &threadpool->mutex); } ggml_mutex_unlock_shared(&threadpool->mutex); @@ -20113,13 +20131,20 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { } // Start processing new graph -static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool) +static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads) { - // always take the mutex here because the worker threads are doing hybrid poll/wait + // Always take the mutex here because the worker threads are doing hybrid poll/wait ggml_mutex_lock(&threadpool->mutex); - atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads); + + // Update the number of active threads + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); + + // Indicate the graph is ready to be processed + // We need the full seq-cst fence here because of the polling threads (used in thread_sync) + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst); if (threadpool->pause) { // Update main thread prio and affinity to match the threadpool settings @@ -20178,6 +20203,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = tpp->paused; + threadpool->abort = false; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; threadpool->n_threads_cur = tpp->n_threads; @@ -20253,15 +20279,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // No worker threads should be accessing the parameters below at this stage threadpool->cgraph = cgraph; threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; threadpool->current_chunk = 0; + threadpool->abort = false; threadpool->ec = GGML_STATUS_SUCCESS; } - if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); - } - #ifdef GGML_USE_OPENMP if (n_threads > 1) { #pragma omp parallel num_threads(n_threads) @@ -20270,17 +20292,23 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl { // update the number of threads from the actual number of threads that we got from OpenMP n_threads = omp_get_num_threads(); - threadpool->n_threads_cur = n_threads; + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); } ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); } } else { + atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed); ggml_graph_compute_thread(&threadpool->workers[0]); } #else + if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); + n_threads = threadpool->n_threads_max; + } + // Kick all threads to start the new graph - ggml_graph_compute_kickoff(threadpool); + ggml_graph_compute_kickoff(threadpool, n_threads); // This is a work thread too ggml_graph_compute_thread(&threadpool->workers[0]); diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index d0c2bb284..0193a463a 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -50,6 +50,7 @@ #include "sgemm.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER @@ -235,6 +236,14 @@ template <> inline __m512 load(const ggml_fp16_t *p) { } #endif // __AVX512F__ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// CONSTANTS + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; +static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl); +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION @@ -933,6 +942,20 @@ class tinyBLAS_Q0_AVX { return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); } + inline __m256i load(const block_iq4_nl *b) { + return MM256_SET_M128I(load1(b), load0(b)); + } + + inline __m128i load0(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x)); + } + + inline __m128i load1(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4))); + } + inline __m256 updot(__m256i u, __m256i s) { __m256i res; #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) @@ -1159,6 +1182,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_IQ4_NL: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_iq4_nl *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + default: return false; } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c87d08782..b36a60d49 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -97,6 +97,8 @@ class Keys: RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" + RESIDUAL_SCALE = "{arch}.residual_scale" + EMBEDDING_SCALE = "{arch}.embedding_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -112,6 +114,7 @@ class Keys: KV_LORA_RANK = "{arch}.attention.kv_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" + SCALE = "{arch}.attention.scale" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -210,6 +213,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + MINICPM3 = auto() GEMMA = auto() GEMMA2 = auto() STARCODER2 = auto() @@ -219,6 +223,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + OLMOE = auto() OPENELM = auto() ARCTIC = auto() DEEPSEEK2 = auto() @@ -229,6 +234,7 @@ class MODEL_ARCH(IntEnum): JAIS = auto() NEMOTRON = auto() EXAONE = auto() + GRANITE = auto() class MODEL_TENSOR(IntEnum): @@ -364,6 +370,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", @@ -373,6 +380,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK2: "deepseek2", @@ -383,6 +391,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.JAIS: "jais", MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -867,6 +876,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.MINICPM3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.GEMMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1008,6 +1034,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], MODEL_ARCH.OPENELM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1186,6 +1229,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GRANITE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 3c95c2673..bd059b45c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -679,6 +679,12 @@ class GGUFWriter: def add_time_decay_extra_dim(self, dim: int) -> None: self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim) + def add_residual_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value) + + def add_embedding_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value) + def add_wkv_head_size(self, size: int) -> None: self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) @@ -703,6 +709,9 @@ class GGUFWriter: def add_sliding_window(self, value: int) -> None: self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value) + def add_attention_scale(self, value: float) -> None: + self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bc9a13ee5..2ebfa2b43 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron + "model.embed_tokens", # llama-hf nemotron olmoe "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -54,7 +54,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -66,7 +66,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 + "model.norm", # llama-hf baichuan internlm2 olmoe "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -98,7 +98,7 @@ class TensorNameMap: "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf nemotron + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi @@ -142,7 +142,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -154,7 +154,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -167,7 +167,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -185,7 +185,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -229,7 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi @@ -253,7 +253,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "model.layers.{bid}.mlp.gate", # qwen2moe + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), @@ -295,7 +295,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -327,7 +327,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( @@ -367,7 +367,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx - "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( @@ -378,7 +378,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -387,7 +387,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm diff --git a/include/llama.h b/include/llama.h index 6cafd8acb..8d326447a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -441,6 +441,7 @@ extern "C" { LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_n_layer (const struct llama_model * model); + LLAMA_API int32_t llama_n_head (const struct llama_model * model); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); diff --git a/src/llama-impl.h b/src/llama-impl.h index 87012617f..2bde75ec1 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3) void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); +#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5275b1d60..5299f5116 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -236,9 +236,10 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte const int n_vocab = llama_n_vocab(llama_get_model(ctx)); // TODO: do not allocate each time - std::vector cur(n_vocab); + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); } llama_token_data_array cur_p = { diff --git a/src/llama.cpp b/src/llama.cpp index d5f2ff530..708d67ba5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -203,6 +203,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_MINICPM3, LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, LLM_ARCH_STARCODER2, @@ -211,6 +212,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK2, @@ -222,6 +224,7 @@ enum llm_arch { LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, + LLM_ARCH_GRANITE, LLM_ARCH_UNKNOWN, }; @@ -251,6 +254,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_STARCODER2, "starcoder2" }, @@ -259,6 +263,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, @@ -270,6 +275,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -309,6 +315,8 @@ enum llm_kv { LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_TIME_MIX_EXTRA_DIM, LLM_KV_TIME_DECAY_EXTRA_DIM, + LLM_KV_RESIDUAL_SCALE, + LLM_KV_EMBEDDING_SCALE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -323,6 +331,7 @@ enum llm_kv { LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, + LLM_KV_ATTENTION_SCALE, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -413,6 +422,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, + { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, + { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -427,6 +438,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, + { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -1044,6 +1056,29 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_MINICPM3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_GEMMA, { @@ -1178,6 +1213,26 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OLMOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_OPENELM, { @@ -1417,6 +1472,22 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, }, }, + { + LLM_ARCH_GRANITE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2266,6 +2337,7 @@ enum e_model { MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, + MODEL_A1_7B, MODEL_A2_7B, MODEL_8x7B, MODEL_8x22B, @@ -2338,6 +2410,11 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; + // Additional scale factors (Granite) + float f_residual_scale = 0.0f; + float f_embedding_scale = 0.0f; + float f_attention_scale = 0.0f; + bool causal_attn = true; bool use_alibi = false; bool attn_soft_cap = false; @@ -2400,6 +2477,9 @@ struct llama_hparams { if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; + if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true; + if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true; + if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true; return false; } @@ -5248,6 +5328,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; case MODEL_XL: return "1.5B"; + case MODEL_A1_7B: return "A1.7B"; case MODEL_A2_7B: return "A2.7B"; case MODEL_8x7B: return "8x7B"; case MODEL_8x22B: return "8x22B"; @@ -5422,6 +5503,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + + switch (hparams.n_layer) { + case 62: model.type = e_model::MODEL_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_GROK: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5787,6 +5879,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OLMOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_A1_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_OPENELM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5983,6 +6083,20 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GRANITE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_3B; break; + // Add additional layer/vocab/etc checks here for other model sizes + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -6025,8 +6139,15 @@ static void llm_load_vocab( vocab.special_mask_id = -1; vocab.linefeed_id = -1; + // read vocab size from metadata + if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { + vocab.n_vocab = 0; + LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab); + } return; - } else if (tokenizer_model == "llama") { + } + + if (tokenizer_model == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; // default special tokens @@ -6699,6 +6820,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } + + if (model.arch == LLM_ARCH_GRANITE) { + LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); + LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); + LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); + } } // Returns false if cancelled by progress_callback @@ -6876,6 +7003,7 @@ static bool llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -6956,6 +7084,54 @@ static bool llm_load_tensors( } } } break; + case LLM_ARCH_MINICPM3: + { + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}); + + layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}); + + layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}); + layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}); + + layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}); + layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + } break; case LLM_ARCH_GROK: { if (n_expert == 0) { @@ -7993,6 +8169,44 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OLMOE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + GGML_ASSERT(n_expert > 0); + GGML_ASSERT(n_expert_used > 0); + + // MoE branch + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; case LLM_ARCH_OPENELM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -8773,6 +8987,11 @@ static struct ggml_tensor * llm_build_inp_embd( ggml_set_input(lctx.inp_embd); } + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); + } + cb(inpL, "inp_embd", -1); return inpL; @@ -9476,7 +9695,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * cur, struct ggml_tensor * x_prev, struct ggml_tensor ** wkv_state) { - size_t n_embed = cur->ne[0]; + size_t n_embd = cur->ne[0]; size_t n_seq_tokens = cur->ne[1]; size_t n_seqs = cur->ne[2]; @@ -9487,8 +9706,8 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); @@ -9513,11 +9732,11 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( xxx ); - struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); - struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); - struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); - struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float)); - struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float)); + struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); + struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); struct ggml_tensor * xw = ggml_add( ctx, @@ -9586,7 +9805,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( ) ); - w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)); + w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd)); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); @@ -9595,21 +9814,21 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( r = ggml_transpose(ctx, r); struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); + cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); cur = ggml_norm(ctx, cur, 64e-5f); // Convert back to regular vectors. - cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); cur = ggml_mul(ctx, cur, g); cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs); + return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); } static struct ggml_tensor * llm_build_rwkv6_channel_mix( @@ -10051,6 +10270,7 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10107,7 +10327,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -10118,6 +10338,11 @@ struct llm_build_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -10154,6 +10379,11 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); } + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); @@ -10173,6 +10403,12 @@ struct llm_build_context { // lm_head cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12920,6 +13156,215 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_minicpm3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * rope_factors = build_rope_factors(il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = llm_build_norm(ctx0, q, hparams, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", il); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -13616,6 +14061,134 @@ struct llm_build_context { return gf; } + // based on the build_qwen2moe() function, changes: + // * removed shared experts + // * removed bias + // * added q, k norm + struct ggml_cgraph * build_olmoe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_moe_ffn(ctx0, lctx, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_openelm() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15375,6 +15948,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_GRANITE: { result = llm.build_llama(); } break; @@ -15460,6 +16034,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_MINICPM3: + { + result = llm.build_minicpm3(); + } break; case LLM_ARCH_GEMMA: { result = llm.build_gemma(); @@ -15492,6 +16070,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_OLMOE: + { + result = llm.build_olmoe(); + } break; case LLM_ARCH_OPENELM: { result = llm.build_openelm(); @@ -16155,7 +16737,7 @@ static int llama_decode_internal( const uint32_t n_tokens_all = batch_all.n_tokens; if (n_tokens_all == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } @@ -16168,7 +16750,7 @@ static int llama_decode_internal( if (batch_all.token) { for (uint32_t i = 0; i < n_tokens_all; ++i) { if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]); return -1; } } @@ -16456,7 +17038,7 @@ static int llama_encode_internal( const uint32_t n_tokens = batch.n_tokens; if (n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } @@ -16469,7 +17051,7 @@ static int llama_encode_internal( if (batch.token) { for (uint32_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } } @@ -18137,9 +18719,9 @@ struct llama_model * llama_load_model_from_file( unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); + LLAMA_LOG("."); if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); + LLAMA_LOG("\n"); } } return true; @@ -18611,6 +19193,10 @@ int32_t llama_n_layer(const struct llama_model * model) { return model->hparams.n_layer; } +int32_t llama_n_head(const struct llama_model * model) { + return model->hparams.n_head(); +} + const struct llama_model * llama_get_model(const struct llama_context * ctx) { return &ctx->model; } @@ -18649,6 +19235,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_CHATGLM: + case LLM_ARCH_GRANITE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -18662,6 +19249,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: + case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: @@ -18672,6 +19260,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_CODESHELL: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: + case LLM_ARCH_MINICPM3: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here @@ -20854,8 +21443,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l if (len < 128) { g_state.log_callback(level, buffer, g_state.log_callback_user_data); } else { - char* buffer2 = new char[len+1]; - vsnprintf(buffer2, len+1, format, args_copy); + char * buffer2 = new char[len + 1]; + vsnprintf(buffer2, len + 1, format, args_copy); buffer2[len] = 0; g_state.log_callback(level, buffer2, g_state.log_callback_user_data); delete[] buffer2; diff --git a/src/unicode.cpp b/src/unicode.cpp index 02c4b926a..46290c4be 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -5,6 +5,7 @@ #include "unicode.h" #include "unicode-data.h" +#include #include #include #include diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index f26707910..e07d09733 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -85,7 +85,7 @@ int main(void) { argv = {"binary_name", "--verbose"}; assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.verbosity == 1); + assert(params.verbosity > 1); argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp new file mode 100644 index 000000000..cf54237db --- /dev/null +++ b/tests/test-barrier.cpp @@ -0,0 +1,93 @@ +#include "ggml.h" +#include "ggml-backend.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_NARGS 2 + +int main(int argc, char *argv[]) { + + int n_threads = 4; + int n_rounds = 100; + + if (argc > 1) { + n_threads = std::atoi(argv[1]); + } + + if (argc > 2) { + n_rounds = std::atoi(argv[2]); + } + + struct ggml_init_params params = { + /* .mem_size = */ 1024*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + struct ggml_context * ctx = ggml_init(params); + + // Create graph + struct ggml_cgraph * gf = ggml_new_graph(ctx); + + // Lots of small, parallel ops where barriers in between will dominate + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64); + for (int i = 0; i < 1000; i++) { + struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); + out = ggml_mul_mat(ctx, a, out); + + struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64); + out = ggml_mul_mat(ctx, d, out); + } + + ggml_build_forward_expand(gf, out); + int n_nodes = ggml_graph_n_nodes(gf); + + // Create threadpool + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads); + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + if (!threadpool) { + fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads); + exit(1); + } + + // Create compute plan + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); + + std::vector work_data(cplan.work_size); + cplan.work_data = work_data.data(); + + std::cerr << "graph-compute with" + << "\n n_threads: " << n_threads + << "\n n_nodes: " << n_nodes + << "\n n_rounds: " << n_rounds + << "\n"; + // ggml_graph_print(gf); + + // Warmup + ggml_graph_compute(gf, &cplan); + + auto t0 = std::chrono::high_resolution_clock::now(); + + for (int i=0; i < n_rounds; i++) { + ggml_graph_compute(gf, &cplan); + } + + auto t1 = std::chrono::high_resolution_clock::now(); + + auto usec = std::chrono::duration_cast(t1-t0).count(); + auto nsec = std::chrono::duration_cast(t1-t0).count(); + std::cerr << "graph-compute took " << usec << " usec " + << "\n " << (float) usec / n_rounds << " usec per-iter" + << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node" + << "\n"; + + ggml_threadpool_free(threadpool); + ggml_free(ctx); + + return 0; +} diff --git a/tests/test-log.cpp b/tests/test-log.cpp new file mode 100644 index 000000000..211222369 --- /dev/null +++ b/tests/test-log.cpp @@ -0,0 +1,39 @@ +#include "log.h" + +#include +#include + +int main() { + const int n_thread = 8; + + std::thread threads[n_thread]; + for (int i = 0; i < n_thread; i++) { + threads[i] = std::thread([i]() { + const int n_msg = 1000; + + for (int j = 0; j < n_msg; j++) { + const int log_type = std::rand() % 4; + + switch (log_type) { + case 0: LOG_INF("Thread %d: %d\n", i, j); break; + case 1: LOG_WRN("Thread %d: %d\n", i, j); break; + case 2: LOG_ERR("Thread %d: %d\n", i, j); break; + case 3: LOG_DBG("Thread %d: %d\n", i, j); break; + default: + break; + } + + if (rand () % 10 < 5) { + gpt_log_set_timestamps(gpt_log_main(), rand() % 2); + gpt_log_set_prefix (gpt_log_main(), rand() % 2); + } + } + }); + } + + for (int i = 0; i < n_thread; i++) { + threads[i].join(); + } + + return 0; +}