Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/server.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	ci/run.sh
#	common/CMakeLists.txt
#	common/common.cpp
#	docs/backend/SYCL.md
#	examples/embedding/embedding.cpp
#	examples/imatrix/imatrix.cpp
#	examples/infill/infill.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/main/README.md
#	examples/parallel/parallel.cpp
#	examples/perplexity/perplexity.cpp
#	examples/server/CMakeLists.txt
#	examples/server/README.md
#	examples/server/bench/README.md
#	examples/server/tests/README.md
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	grammars/README.md
#	scripts/compare-commits.sh
#	scripts/compare-llama-bench.py
#	tests/CMakeLists.txt
This commit is contained in:
Concedo 2024-09-19 14:53:57 +08:00
commit 29625c3d2e
54 changed files with 3396 additions and 2709 deletions

View file

@ -1,15 +1,17 @@
#include "arg.h" #include "arg.h"
#include "log.h"
#include "sampling.h" #include "sampling.h"
#include <algorithm> #include <algorithm>
#include <string> #include <climits>
#include <vector> #include <cstdarg>
#include <set>
#include <fstream> #include <fstream>
#include <regex> #include <regex>
#include <cstdarg> #include <set>
#include <climits> #include <string>
#include <thread>
#include <vector>
#include "json-schema-to-grammar.h" #include "json-schema-to-grammar.h"
@ -383,20 +385,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
exit(0); exit(0);
} }
)); ));
add_opt(llama_arg(
{"-v", "--verbose"},
"print verbose information",
[](gpt_params & params) {
params.verbosity = 1;
}
));
add_opt(llama_arg(
{"--verbosity"}, "N",
format("set specific verbosity level (default: %d)", params.verbosity),
[](gpt_params & params, int value) {
params.verbosity = value;
}
));
add_opt(llama_arg( add_opt(llama_arg(
{"--verbose-prompt"}, {"--verbose-prompt"},
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@ -417,7 +405,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
[](gpt_params & params) { [](gpt_params & params) {
params.use_color = true; params.use_color = true;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(llama_arg( add_opt(llama_arg(
{"-t", "--threads"}, "N", {"-t", "--threads"}, "N",
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@ -697,6 +685,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.n_keep = value; params.n_keep = value;
} }
)); ));
add_opt(llama_arg(
{"--no-context-shift"},
format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
[](gpt_params & params) {
params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(llama_arg( add_opt(llama_arg(
{"--chunks"}, "N", {"--chunks"}, "N",
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@ -876,7 +871,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.input_prefix = value; params.input_prefix = value;
params.enable_chat_template = false; params.enable_chat_template = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
add_opt(llama_arg( add_opt(llama_arg(
{"--in-suffix"}, "STRING", {"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)", "string to suffix after user inputs with (default: empty)",
@ -884,7 +879,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.input_suffix = value; params.input_suffix = value;
params.enable_chat_template = false; params.enable_chat_template = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
add_opt(llama_arg( add_opt(llama_arg(
{"--no-warmup"}, {"--no-warmup"},
"skip warming up the model with an empty run", "skip warming up the model with an empty run",
@ -1317,7 +1312,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
[](gpt_params & params, int value) { [](gpt_params & params, int value) {
params.n_parallel = value; params.n_parallel = value;
} }
)); ).set_env("LLAMA_ARG_N_PARALLEL"));
add_opt(llama_arg( add_opt(llama_arg(
{"-ns", "--sequences"}, "N", {"-ns", "--sequences"}, "N",
format("number of sequences to decode (default: %d)", params.n_sequences), format("number of sequences to decode (default: %d)", params.n_sequences),
@ -1824,19 +1819,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.system_prompt = system_prompt; params.system_prompt = system_prompt;
} }
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"--log-format"}, "{text, json}",
"log output format: json or text (default: json)",
[](gpt_params & params, const std::string & value) {
if (value == "json") {
params.log_json = true;
} else if (value == "text") {
params.log_json = false;
} else {
throw std::invalid_argument("invalid value");
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg( add_opt(llama_arg(
{"--metrics"}, {"--metrics"},
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
@ -1956,40 +1938,57 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
else { std::invalid_argument("invalid value"); } else { std::invalid_argument("invalid value"); }
} }
).set_examples({LLAMA_EXAMPLE_BENCH})); ).set_examples({LLAMA_EXAMPLE_BENCH}));
#ifndef LOG_DISABLE_LOGS
// TODO: make this looks less weird
add_opt(llama_arg(
{"--log-test"},
"Log test",
[](gpt_params &) { log_param_single_parse("--log-test"); }
));
add_opt(llama_arg( add_opt(llama_arg(
{"--log-disable"}, {"--log-disable"},
"Log disable", "Log disable",
[](gpt_params &) { log_param_single_parse("--log-disable"); } [](gpt_params &) {
)); gpt_log_pause(gpt_log_main());
add_opt(llama_arg( }
{"--log-enable"},
"Log enable",
[](gpt_params &) { log_param_single_parse("--log-enable"); }
));
add_opt(llama_arg(
{"--log-new"},
"Log new",
[](gpt_params &) { log_param_single_parse("--log-new"); }
));
add_opt(llama_arg(
{"--log-append"},
"Log append",
[](gpt_params &) { log_param_single_parse("--log-append"); }
)); ));
add_opt(llama_arg( add_opt(llama_arg(
{"--log-file"}, "FNAME", {"--log-file"}, "FNAME",
"Log file", "Log to file",
[](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } [](gpt_params &, const std::string & value) {
gpt_log_set_file(gpt_log_main(), value.c_str());
}
)); ));
#endif // LOG_DISABLE_LOGS add_opt(llama_arg(
{"--log-colors"},
"Enable colored logging",
[](gpt_params &) {
gpt_log_set_colors(gpt_log_main(), true);
}
).set_env("LLAMA_LOG_COLORS"));
add_opt(llama_arg(
{"-v", "--verbose", "--log-verbose"},
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](gpt_params & params) {
params.verbosity = INT_MAX;
gpt_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(llama_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
[](gpt_params & params, int value) {
params.verbosity = value;
gpt_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(llama_arg(
{"--log-prefix"},
"Enable prefx in log messages",
[](gpt_params &) {
gpt_log_set_prefix(gpt_log_main(), true);
}
).set_env("LLAMA_LOG_PREFIX"));
add_opt(llama_arg(
{"--log-timestamps"},
"Enable timestamps in log messages",
[](gpt_params &) {
gpt_log_set_timestamps(gpt_log_main(), true);
}
).set_env("LLAMA_LOG_TIMESTAMPS"));
return ctx_arg; return ctx_arg;
} }

View file

@ -3,7 +3,9 @@
#endif #endif
#include "common.h" #include "common.h"
#include "log.h"
#include "build-info.h" #include "build-info.h"
#include "log.cpp"
// Change JSON_ASSERT from assert() to GGML_ASSERT: // Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT #define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
@ -26,6 +28,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include <thread>
#if defined(__APPLE__) && defined(__MACH__) #if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h> #include <sys/types.h>
@ -49,7 +52,6 @@
#if defined(LLAMA_USE_CURL) #if defined(LLAMA_USE_CURL)
#include <curl/curl.h> #include <curl/curl.h>
#include <curl/easy.h> #include <curl/easy.h>
#include <thread>
#include <future> #include <future>
#endif #endif
@ -227,7 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
} }
if (!SetPriorityClass(GetCurrentProcess(), p)) { if (!SetPriorityClass(GetCurrentProcess(), p)) {
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false; return false;
} }
@ -252,7 +254,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
} }
if (!setpriority(PRIO_PROCESS, 0, p)) { if (!setpriority(PRIO_PROCESS, 0, p)) {
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false; return false;
} }
return true; return true;
@ -285,14 +287,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
if (n_set && n_set < cpuparams.n_threads) { if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues. // Not enough set bits, may experience performance issues.
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
} }
} }
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-'); size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) { if (dash_loc == std::string::npos) {
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n"); LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false; return false;
} }
@ -304,7 +306,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
} else { } else {
start_i = std::stoull(range.substr(0, dash_loc)); start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) { if (start_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "Start index out of bounds!\n"); LOG_ERR("Start index out of bounds!\n");
return false; return false;
} }
} }
@ -314,7 +316,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
} else { } else {
end_i = std::stoull(range.substr(dash_loc + 1)); end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) { if (end_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "End index out of bounds!\n"); LOG_ERR("End index out of bounds!\n");
return false; return false;
} }
} }
@ -349,7 +351,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
} else if (c >= 'A' && c <= 'F') { } else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10; id -= 'A' - 10;
} else { } else {
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false; return false;
} }
@ -362,6 +364,22 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
return true; return true;
} }
void gpt_init() {
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
gpt_log_add(gpt_log_main(), level, "%s", text);
}
}, NULL);
#ifdef NDEBUG
const char * build_type = "";
#else
const char * build_type = " (debug)";
#endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
std::string gpt_params_get_system_info(const gpt_params & params) { std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os; std::ostringstream os;
@ -442,6 +460,94 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder); s = std::move(builder);
} }
std::string string_from(bool value) {
return value ? "true" : "false";
}
std::string string_from(const std::vector<int> & values) {
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : values) {
if (first) {
first = false;
} else {
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
std::stringstream buf;
buf << "[ ";
bool first = true;
for (const auto & token : tokens) {
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
buf << " ]";
return buf.str();
}
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
std::stringstream buf;
buf << "[ ";
bool first = true;
for (int i = 0; i < batch.n_tokens; ++i) {
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf << "\n" << std::to_string(i)
<< ":token '" << detokenized << "'"
<< ":pos " << std::to_string(batch.pos[i])
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
<< ":logits " << std::to_string(batch.logits[i]);
}
buf << " ]";
return buf.str();
}
void string_process_escapes(std::string & input) { void string_process_escapes(std::string & input) {
std::size_t input_len = input.length(); std::size_t input_len = input.length();
std::size_t output_idx = 0; std::size_t output_idx = 0;
@ -482,7 +588,7 @@ void string_process_escapes(std::string & input) {
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) { bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char * sep = strchr(data, '='); const char * sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) { if (sep == nullptr || sep - data >= 128) {
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
return false; return false;
} }
llama_model_kv_override kvo; llama_model_kv_override kvo;
@ -505,20 +611,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
} else if (std::strcmp(sep, "false") == 0) { } else if (std::strcmp(sep, "false") == 0) {
kvo.val_bool = false; kvo.val_bool = false;
} else { } else {
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false; return false;
} }
} else if (strncmp(sep, "str:", 4) == 0) { } else if (strncmp(sep, "str:", 4) == 0) {
sep += 4; sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
if (strlen(sep) > 127) { if (strlen(sep) > 127) {
fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
return false; return false;
} }
strncpy(kvo.val_str, sep, 127); strncpy(kvo.val_str, sep, 127);
kvo.val_str[127] = '\0'; kvo.val_str[127] = '\0';
} else { } else {
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
return false; return false;
} }
overrides.emplace_back(std::move(kvo)); overrides.emplace_back(std::move(kvo));
@ -730,7 +836,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
} }
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams; return iparams;
} }
@ -738,7 +844,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_context * lctx = llama_new_context_with_model(model, cparams); llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) { if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model); llama_free_model(model);
return iparams; return iparams;
} }
@ -774,7 +880,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
loaded_la.scale = la.scale; loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) { if (loaded_la.adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx); llama_free(lctx);
llama_free_model(model); llama_free_model(model);
return iparams; return iparams;
@ -786,12 +892,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
} }
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) { if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sparams.ignore_eos = false; params.sparams.ignore_eos = false;
} }
if (params.warmup) { if (params.warmup) {
LOG("warming up the model with an empty run\n"); LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp; std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model); llama_token bos = llama_token_bos(model);
@ -956,7 +1062,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
int remaining_attempts = max_attempts; int remaining_attempts = max_attempts;
while (remaining_attempts > 0) { while (remaining_attempts > 0) {
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl); CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) { if (res == CURLE_OK) {
@ -964,13 +1070,14 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
} }
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--; remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
} }
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false; return false;
} }
@ -979,7 +1086,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
// Initialize libcurl // Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup); std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
if (!curl) { if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__); LOG_ERR("%s: error initializing libcurl\n", __func__);
return false; return false;
} }
@ -1020,11 +1127,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
if (metadata_in.good()) { if (metadata_in.good()) {
try { try {
metadata_in >> metadata; metadata_in >> metadata;
fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) { if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>(); auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) { if (previous_url != url) {
fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false; return false;
} }
} }
@ -1035,12 +1142,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
last_modified = metadata.at("lastModified"); last_modified = metadata.at("lastModified");
} }
} catch (const nlohmann::json::exception & e) { } catch (const nlohmann::json::exception & e) {
fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false; return false;
} }
} }
} else { } else {
fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
} }
// Send a HEAD request to retrieve the etag and last-modified headers // Send a HEAD request to retrieve the etag and last-modified headers
@ -1088,26 +1195,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
// HEAD not supported, we don't know if the file has changed // HEAD not supported, we don't know if the file has changed
// force trigger downloading // force trigger downloading
force_download = true; force_download = true;
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
} }
} }
bool should_download = !file_exists || force_download; bool should_download = !file_exists || force_download;
if (!should_download) { if (!should_download) {
if (!etag.empty() && etag != headers.etag) { if (!etag.empty() && etag != headers.etag) {
fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true; should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) { } else if (!last_modified.empty() && last_modified != headers.last_modified) {
fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true; should_download = true;
} }
} }
if (should_download) { if (should_download) {
std::string path_temporary = path + ".downloadInProgress"; std::string path_temporary = path + ".downloadInProgress";
if (file_exists) { if (file_exists) {
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) { if (remove(path.c_str()) != 0) {
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false; return false;
} }
} }
@ -1122,7 +1229,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb")); std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) { if (!outfile) {
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false; return false;
} }
@ -1153,7 +1260,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
}; };
// start the download // start the download
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) { if (!was_perform_successful) {
@ -1163,7 +1270,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
long http_code = 0; long http_code = 0;
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) { if (http_code < 200 || http_code >= 400) {
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return false; return false;
} }
@ -1177,10 +1284,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
{"lastModified", headers.last_modified} {"lastModified", headers.last_modified}
}); });
std::ofstream(metadata_path) << metadata.dump(4); std::ofstream(metadata_path) << metadata.dump(4);
fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) { if (rename(path_temporary.c_str(), path.c_str()) != 0) {
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false; return false;
} }
} }
@ -1195,7 +1302,7 @@ struct llama_model * llama_load_model_from_url(
const struct llama_model_params & params) { const struct llama_model_params & params) {
// Basic validation of the model_url // Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) { if (!model_url || strlen(model_url) == 0) {
fprintf(stderr, "%s: invalid model_url\n", __func__); LOG_ERR("%s: invalid model_url\n", __func__);
return NULL; return NULL;
} }
@ -1212,7 +1319,7 @@ struct llama_model * llama_load_model_from_url(
}; };
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
if (!ctx_gguf) { if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
return NULL; return NULL;
} }
@ -1232,14 +1339,12 @@ struct llama_model * llama_load_model_from_url(
// and extract split URL and PATH prefixes // and extract split URL and PATH prefixes
{ {
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
fprintf(stderr, "\n%s: unexpected model file name: %s" LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
" n_split=%d\n", __func__, path_model, n_split);
return NULL; return NULL;
} }
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
fprintf(stderr, "\n%s: unexpected model url: %s" LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
" n_split=%d\n", __func__, model_url, n_split);
return NULL; return NULL;
} }
} }
@ -1299,7 +1404,7 @@ struct llama_model * llama_load_model_from_url(
const char * /*path_model*/, const char * /*path_model*/,
const char * /*hf_token*/, const char * /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr; return nullptr;
} }
@ -1309,7 +1414,7 @@ struct llama_model * llama_load_model_from_hf(
const char * /*path_model*/, const char * /*path_model*/,
const char * /*hf_token*/, const char * /*hf_token*/,
const struct llama_model_params & /*params*/) { const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr; return nullptr;
} }
@ -1637,13 +1742,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
}; };
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) { if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result; return result;
} }
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) { if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
} }
for (int i = 0; i < n_tensors; i++) { for (int i = 0; i < n_tensors; i++) {
@ -1661,23 +1766,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
} }
} }
if (layer_idx < 0) { if (layer_idx < 0) {
fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1; result.n_embd = -1;
break; break;
} else if (layer_idx == 0) { } else if (layer_idx == 0) {
fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1; result.n_embd = -1;
break; break;
} }
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) { if (tensor->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1; result.n_embd = -1;
break; break;
} }
if (ggml_n_dims(tensor) != 1) { if (ggml_n_dims(tensor) != 1) {
fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1; result.n_embd = -1;
break; break;
} }
@ -1685,7 +1790,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
if (result.n_embd == -1) { if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor); result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) { } else if (ggml_nelements(tensor) != result.n_embd) {
fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1; result.n_embd = -1;
break; break;
} }
@ -1702,7 +1807,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
} }
if (result.n_embd == -1) { if (result.n_embd == -1) {
fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear(); result.data.clear();
} }
@ -1723,7 +1828,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
break; break;
} }
if (result.n_embd != -1 && result.n_embd != cur.n_embd) { if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str()); LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1; result.n_embd = -1;
break; break;
} }
@ -1739,7 +1844,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
} }
if (result.n_embd == -1) { if (result.n_embd == -1) {
fprintf(stderr, "%s: no valid control vector files passed\n", __func__); LOG_ERR("%s: no valid control vector files passed\n", __func__);
result.data.clear(); result.data.clear();
} }

View file

@ -4,11 +4,9 @@
#include "llama.h" #include "llama.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include <sstream>
#ifdef _WIN32 #ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\' #define DIRECTORY_SEPARATOR '\\'
@ -244,6 +242,7 @@ struct gpt_params {
bool cont_batching = true; // insert new sequences for decoding on-the-fly bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch bool logits_all = false; // return logits for all tokens in the batch
@ -339,6 +338,10 @@ struct gpt_params {
bool batched_bench_output_jsonl = false; bool batched_bench_output_jsonl = false;
}; };
// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void gpt_init();
std::string gpt_params_get_system_info(const gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
@ -374,6 +377,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
// //
// Filesystem utils // Filesystem utils
// //

401
common/log.cpp Normal file
View file

@ -0,0 +1,401 @@
#include "log.h"
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void gpt_log_set_verbosity_thold(int verbosity) {
gpt_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"
#define LOG_COL_GREEN "\033[32m"
#define LOG_COL_YELLOW "\033[33m"
#define LOG_COL_BLUE "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN "\033[36m"
#define LOG_COL_WHITE "\033[37m"
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
// colors
enum gpt_log_col : int {
GPT_LOG_COL_DEFAULT = 0,
GPT_LOG_COL_BOLD,
GPT_LOG_COL_RED,
GPT_LOG_COL_GREEN,
GPT_LOG_COL_YELLOW,
GPT_LOG_COL_BLUE,
GPT_LOG_COL_MAGENTA,
GPT_LOG_COL_CYAN,
GPT_LOG_COL_WHITE,
};
// disable colors by default
static std::vector<const char *> g_col = {
"",
"",
"",
"",
"",
"",
"",
"",
"",
};
struct gpt_log_entry {
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
std::vector<char> msg;
// signals the worker thread to stop
bool is_end;
void print(FILE * file = nullptr) const {
FILE * fcur = file;
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
fcur = stdout;
if (level != GGML_LOG_LEVEL_NONE) {
fcur = stderr;
}
}
if (level != GGML_LOG_LEVEL_NONE && prefix) {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
g_col[GPT_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
g_col[GPT_LOG_COL_DEFAULT]);
}
switch (level) {
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
default:
break;
}
}
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
struct gpt_log {
// default capacity - will be expanded if needed
gpt_log() : gpt_log(256) {}
gpt_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
head = 0;
tail = 0;
resume();
}
~gpt_log() {
pause();
if (file) {
fclose(file);
}
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
FILE * file;
bool prefix;
bool timestamps;
bool running;
int64_t t_start;
// ring buffer of entries
std::vector<gpt_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
gpt_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = entries[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
va_list args_copy;
va_copy(args_copy, args);
#if 1
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
}
#else
// hack for bolding arguments
std::stringstream ss;
for (int i = 0; fmt[i] != 0; i++) {
if (fmt[i] == '%') {
ss << LOG_COL_BOLD;
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
ss << LOG_COL_DEFAULT;
if (fmt[i] == 0) break;
}
ss << fmt[i];
}
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
if (n >= entry.msg.size()) {
entry.msg.resize(n + 1);
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
}
#endif
}
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<gpt_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
}
void resume() {
std::lock_guard<std::mutex> lock(mtx);
if (running) {
return;
}
running = true;
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
head = (head + 1) % entries.size();
}
if (cur.is_end) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
void pause() {
{
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
return;
}
running = false;
// push an entry to signal the worker thread to stop
{
auto & entry = entries[tail];
entry.is_end = true;
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
thrd.join();
}
void set_file(const char * path) {
pause();
if (file) {
fclose(file);
}
if (path) {
file = fopen(path, "w");
} else {
file = nullptr;
}
resume();
}
void set_colors(bool colors) {
pause();
if (colors) {
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
}
}
resume();
}
void set_prefix(bool prefix) {
std::lock_guard<std::mutex> lock(mtx);
this->prefix = prefix;
}
void set_timestamps(bool timestamps) {
std::lock_guard<std::mutex> lock(mtx);
this->timestamps = timestamps;
}
};
//
// public API
//
struct gpt_log * gpt_log_init() {
return new gpt_log;
}
struct gpt_log * gpt_log_main() {
static struct gpt_log log;
return &log;
}
void gpt_log_pause(struct gpt_log * log) {
log->pause();
}
void gpt_log_resume(struct gpt_log * log) {
log->resume();
}
void gpt_log_free(struct gpt_log * log) {
delete log;
}
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}
void gpt_log_set_file(struct gpt_log * log, const char * file) {
log->set_file(file);
}
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
log->set_colors(colors);
}
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
log->set_prefix(prefix);
}
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}

View file

@ -1,724 +1,90 @@
#pragma once #pragma once
#include <chrono> #include "ggml.h" // for ggml_log_level
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
// -------------------------------- #ifndef __GNUC__
// # define LOG_ATTRIBUTE_FORMAT(...)
// Basic usage: #elif defined(__MINGW32__)
// # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing to point at stderr, stdout, or any valid FILE* file handler.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with "llama.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a different function
// like so:
//
// #define LOG_TARGET log_handler_different()
// #include "log.h"
//
// FILE* log_handler_different()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
{
static bool _multilog = false;
if (multilog != LogTriStateSame)
{
_multilog = multilog == LogTriStateTrue;
}
std::stringstream buf;
buf << log_file_basename;
if (_multilog)
{
buf << ".";
buf << log_get_pid();
}
buf << ".";
buf << log_file_extension;
return buf.str();
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else #else
#define LOG_TIMESTAMP_FMT "%s" # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#define LOG_TIMESTAMP_VAL ,""
#endif #endif
#ifdef LOG_TEE_TIMESTAMPS #define LOG_DEFAULT_DEBUG 1
#ifndef _MSC_VER #define LOG_DEFAULT_LLAMA 0
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TEE_TIMESTAMP_FMT "%s"
#define LOG_TEE_TIMESTAMP_VAL ,""
#endif
// Allows disabling file/line/function prefix // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION // set via gpt_log_set_verbosity()
// like so: extern int gpt_log_verbosity_thold;
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
// the gpt_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;
struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void gpt_log_free (struct gpt_log * log);
LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
// //
// #define LOG_NO_FILE_LINE_FUNCTION // regular log output:
// #include "log.h"
// //
#ifndef LOG_NO_FILE_LINE_FUNCTION // ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
#ifndef _MSC_VER // llm_load_tensors: ggml ctx size = 0.27 MiB
#define LOG_FLF_FMT "[%24s:%5d][%24s] " // llm_load_tensors: offloading 32 repeating layers to GPU
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ // llm_load_tensors: offloading non-repeating layers to GPU
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
// //
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) // with prefix = true, timestamps = true, the log output will look like this:
#define LOG_IMPL(str, ...) \ //
do { \ // 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
if (LOG_TARGET != nullptr) \ // 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
{ \ // 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ // 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
fflush(LOG_TARGET); \ //
} \ // I - info (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error (stderr, V = 0)
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
// for example:
//
// LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//
#define LOG_TMPL(level, verbosity, ...) \
do { \
if ((verbosity) <= gpt_log_verbosity_thold) { \
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
} \
} while (0) } while (0)
#else
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
} while (0)
#endif
// INTERNAL, DO NOT USE #define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
// USE LOG_TEE() INSTEAD #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
} while (0)
#else
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
} while (0)
#endif
// The '\0' as a last argument, is a trick to bypass the silly #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
// so we can have a single macro which can be called just like printf. #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
// Main LOG macro. #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
// behaves like printf, and supports arguments the exact same way. #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
// #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
#if !defined(_MSC_VER) || defined(__clang__) #define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized = false;
static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue)
{
// Disable primary target
_disabled = true;
}
// If previously disabled, only enable, and keep previous target
else if (disable == LogTriStateFalse)
{
_disabled = false;
}
// Otherwise, process the arguments
else if (log_current_filename != filename || log_current_target != target)
{
_initialized = false;
}
}
if (_disabled)
{
// Log is disabled
return nullptr;
}
if (_initialized)
{
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
// do the (re)initialization
if (target != nullptr)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
log_current_filename = LOG_DEFAULT_FILE_NAME;
log_current_target = target;
logfile = target;
}
else
{
if (log_current_filename != filename)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
}
logfile = fopen(filename.c_str(), _append ? "a" : "w");
}
if (!logfile)
{
// Verify whether the file was opened, otherwise fallback to stderr
logfile = stderr;
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
fflush(stderr);
// At this point we let the init flag be to true below, and let the target fallback to stderr
// otherwise we would repeatedly fopen() which was already unsuccessful
}
_initialized = true;
return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, append, disable, filename, target);
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// until enabled back.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
}
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
inline void log_test()
{
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n");
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
log_set_target(stderr);
LOG("04 Hello World to stderr!\n");
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n");
log_set_target(stdout);
LOG("07 Hello World to stdout!\n");
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n");
log_disable();
LOG("09 Hello World _1_ into the void!\n");
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n");
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n");
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n");
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
LOG_TEELN("17 Hello msvc TEELN without arguments\n");
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
LOG("19 Hello msvc LOG without arguments\n");
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
LOGLN("21 Hello msvc LOGLN without arguments\n");
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
#endif
}
inline bool log_param_single_parse(const std::string & param)
{
if ( param == "--log-test")
{
log_test();
return true;
}
if ( param == "--log-disable")
{
log_disable();
return true;
}
if ( param == "--log-enable")
{
log_enable();
return true;
}
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false;
}
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
if ( param == "--log-file")
{
if (!check_but_dont_parse)
{
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
}
return true;
}
return false;
}
inline void log_print_usage()
{
printf("log options:\n");
/* format
printf(" -h, --help show this help message and exit\n");*/
/* spacing
printf("__-param----------------Description\n");*/
printf(" --log-test Run simple logging test\n");
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf("\n");
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
inline void log_dump_cmdline_impl(int argc, char **argv)
{
std::stringstream buf;
for (int i = 0; i < argc; ++i)
{
if (std::string(argv[i]).find(' ') != std::string::npos)
{
buf << " \"" << argv[i] <<"\"";
}
else
{
buf << " " << argv[i];
}
}
LOGLN("Cmd:%s", buf.str().c_str());
}
#define log_tostr(var) log_var_to_string_impl(var).c_str()
inline std::string log_var_to_string_impl(bool var)
{
return var ? "true" : "false";
}
inline std::string log_var_to_string_impl(std::string var)
{
return var;
}
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : var)
{
if (first)
{
first = false;
}
else
{
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (const auto & token : tokens)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
buf << " ]";
return buf.str();
}
template <typename C, typename B>
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (int i = 0; i < batch.n_tokens; ++i)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "\n" << std::to_string(i)
<< ":token '" << detokenized << "'"
<< ":pos " << std::to_string(batch.pos[i])
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
<< ":logits " << std::to_string(batch.logits[i]);
}
buf << " ]";
return buf.str();
}
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS

View file

@ -2,8 +2,11 @@
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
#include <cinttypes>
#include <cstdint> #include <cstdint>
#include <cstdio>
#include <fstream> #include <fstream>
#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) { std::vector<llama_token> & inp, int nnew, bool print_progress) {

View file

@ -325,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
} }
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
std::string result = "\tlogits "; std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);

View file

@ -1,9 +1,11 @@
#include "train.h" #include "train.h"
#include "common.h" #include "common.h"
#include <algorithm>
#include <random> #include <random>
#include <sstream> #include <sstream>
#include <functional> #include <functional>
#include <cstring>
struct random_normal_distribution { struct random_normal_distribution {
std::mt19937 gen; std::mt19937 gen;

View file

@ -132,12 +132,14 @@ class Model:
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_names_from_parts: set[str] = set() tensor_names_from_parts: set[str] = set()
if len(self.part_names) > 1: index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
index_file = self.dir_model / index_name
if index_file.is_file():
self.tensor_names = set() self.tensor_names = set()
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
logger.info(f"gguf: loading model weight map from '{index_name}'") logger.info(f"gguf: loading model weight map from '{index_name}'")
with open(self.dir_model / index_name, "r", encoding="utf-8") as f: with open(index_file, "r", encoding="utf-8") as f:
index: dict[str, Any] = json.load(f) index: dict[str, Any] = json.load(f)
weight_map = index.get("weight_map") weight_map = index.get("weight_map")
if weight_map is None or not isinstance(weight_map, dict): if weight_map is None or not isinstance(weight_map, dict):
@ -145,6 +147,7 @@ class Model:
self.tensor_names.update(weight_map.keys()) self.tensor_names.update(weight_map.keys())
else: else:
self.tensor_names = tensor_names_from_parts self.tensor_names = tensor_names_from_parts
weight_map = {}
for part_name in self.part_names: for part_name in self.part_names:
logger.info(f"gguf: loading model part '{part_name}'") logger.info(f"gguf: loading model part '{part_name}'")
@ -171,9 +174,17 @@ class Model:
data = LazyTorchTensor.from_eager(data) data = LazyTorchTensor.from_eager(data)
yield name, data yield name, data
# only verify tensor name presence; it doesn't matter if they are not in the right files # verify tensor name presence and identify potentially missing files
if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
if len(extra) == 0 and len(missing_files) > 0:
raise ValueError(f"Missing or incomplete model files: {missing_files}")
else:
raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
f"Missing tensors: {missing}\n"
f"Extra tensors: {extra}")
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
if key not in gguf.MODEL_TENSORS[self.model_arch]: if key not in gguf.MODEL_TENSORS[self.model_arch]:
@ -1841,6 +1852,60 @@ class MiniCPMModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("MiniCPM3ForCausalLM")
class MiniCPM3Model(Model):
model_arch = gguf.MODEL_ARCH.MINICPM3
def set_gguf_parameters(self):
hparams = self.hparams
rope_dims = hparams["qk_rope_head_dim"]
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is None:
return
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
def set_vocab(self):
self._set_vocab_llama_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
@Model.register("QWenLMHeadModel") @Model.register("QWenLMHeadModel")
class QwenModel(Model): class QwenModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN model_arch = gguf.MODEL_ARCH.QWEN
@ -2944,6 +3009,66 @@ class OlmoModel(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("OlmoeForCausalLM")
class OlmoeModel(Model):
model_arch = gguf.MODEL_ARCH.OLMOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
_experts: list[dict[str, Tensor]] | None = None
# Copied from: Qwen2MoeModel
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
# Copied from: Qwen2MoeModel
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("JinaBertModel", "JinaBertForMaskedLM") @Model.register("JinaBertModel", "JinaBertForMaskedLM")
class JinaBertV2Model(BertModel): class JinaBertV2Model(BertModel):
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@ -3955,6 +4080,36 @@ class ExaoneModel(Model):
super().prepare_tensors() super().prepare_tensors()
@Model.register("GraniteForCausalLM")
class GraniteModel(LlamaModel):
"""Conversion for IBM's GraniteForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE
def set_gguf_parameters(self):
"""Granite uses standard llama parameters with the following differences:
- No head_dim support
- New multiplier params:
- attention_scale
- embedding_scale
- residual_scale
- logits_scaling
"""
if head_dim := self.hparams.pop("head_dim", None):
logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
super().set_gguf_parameters()
# NOTE: Convert _multiplier params to _scale params for naming
# consistency
if attention_scale := self.hparams.get("attention_multiplier"):
self.gguf_writer.add_attention_scale(attention_scale)
if embedding_scale := self.hparams.get("embedding_multiplier"):
self.gguf_writer.add_embedding_scale(embedding_scale)
if residual_scale := self.hparams.get("residual_multiplier"):
self.gguf_writer.add_residual_scale(residual_scale)
if logits_scaling := self.hparams.get("logits_scaling"):
self.gguf_writer.add_logit_scale(logits_scaling)
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######
# tree of lazy tensors # tree of lazy tensors

View file

@ -1,5 +1,6 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
@ -8,9 +9,9 @@
#include <vector> #include <vector>
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n"); LOG("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n"); LOG("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -20,6 +21,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
int is_pp_shared = params.is_pp_shared; int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp; std::vector<int> n_pp = params.n_pp;
@ -76,7 +79,7 @@ int main(int argc, char ** argv) {
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
if (ret != 0) { if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false; return false;
} }
@ -93,17 +96,17 @@ int main(int argc, char ** argv) {
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
} }
if (!params.batched_bench_output_jsonl) { if (!params.batched_bench_output_jsonl) {
LOG_TEE("\n"); LOG("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n"); LOG("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
} }
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
@ -133,7 +136,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@ -155,7 +158,7 @@ int main(int argc, char ** argv) {
} }
if (!decode_helper(ctx, batch, ctx_params.n_batch)) { if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
} }
@ -173,20 +176,20 @@ int main(int argc, char ** argv) {
const float speed = n_kv / t; const float speed = n_kv / t;
if(params.batched_bench_output_jsonl) { if(params.batched_bench_output_jsonl) {
LOG_TEE( LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
); );
} else { } else {
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
} }
} }
} }
} }
LOG_TEE("\n"); LOG("\n");
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
llama_batch_free(batch); llama_batch_free(batch);
@ -196,7 +199,7 @@ int main(int argc, char ** argv) {
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }

View file

@ -1,5 +1,6 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
@ -8,9 +9,9 @@
#include <vector> #include <vector>
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n"); LOG("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n"); LOG("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -23,6 +24,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
// number of parallel batches // number of parallel batches
int n_parallel = params.n_parallel; int n_parallel = params.n_parallel;
@ -42,7 +44,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1; return 1;
} }
@ -72,31 +74,29 @@ int main(int argc, char ** argv) {
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1; return 1;
} }
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens // make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) { if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1; return 1;
} }
// print the prompt token-by-token // print the prompt token-by-token
fprintf(stderr, "\n"); LOG("\n");
for (auto id : tokens_list) { for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr);
// create a llama_batch // create a llama_batch
// we use this object to submit token data for decoding // we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
if (llama_model_has_encoder(model)) { if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) { if (llama_encode(ctx, batch)) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
//} //}
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
} }
// main loop // main loop
@ -175,9 +175,9 @@ int main(int argc, char ** argv) {
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1; i_batch[i] = -1;
LOG_TEE("\n"); LOG("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
} }
continue; continue;
@ -185,8 +185,7 @@ int main(int argc, char ** argv) {
// if there is only one stream, we print immediately to stdout // if there is only one stream, we print immediately to stdout
if (n_parallel == 1) { if (n_parallel == 1) {
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
} }
streams[i] += llama_token_to_piece(ctx, new_token_id); streams[i] += llama_token_to_piece(ctx, new_token_id);
@ -208,27 +207,25 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model // evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) { if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1; return 1;
} }
} }
LOG_TEE("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
LOG_TEE("\n"); LOG("\n");
for (int32_t i = 0; i < n_parallel; ++i) { for (int32_t i = 0; i < n_parallel; ++i) {
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
} }
} }
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n"); LOG("\n");
llama_perf_sampler_print(smpl); llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx); llama_perf_context_print(ctx);

View file

@ -9,6 +9,7 @@
#include <climits> #include <climits>
#include <cstring> #include <cstring>
#include <cstdarg> #include <cstdarg>
#include <cinttypes>
#include <ctime> #include <ctime>
#include <random> #include <random>
#include <stdexcept> #include <stdexcept>
@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try { try {
w->token_embedding_table.resize(p->vocab_size * p->dim); w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim); w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim); w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) { if (shared_weights) {
w->wcls = {}; w->wcls = {};
} else { } else {
w->wcls.resize(p->vocab_size * p->dim); w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
} }
} }
catch (std::length_error &) { catch (std::length_error &) {
@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
fseek(f, 0, SEEK_END); fseek(f, 0, SEEK_END);
auto end = ftell(f); auto end = ftell(f);
if (curr != end) { if (curr != end) {
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1; return 1;
} }
@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
} }
static void print_sample_weights(TransformerWeights *w){ static void print_sample_weights(TransformerWeights *w){
LOG("----- Quick print of first of the weight vales of all the variables\n"); LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]); LOG_INF("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]); LOG_INF("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]); LOG_INF("%f\n", w->rms_ffn_weight[0]);
LOG("%f\n", w->wq[0]); LOG_INF("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]); LOG_INF("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]); LOG_INF("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]); LOG_INF("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]); LOG_INF("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]); LOG_INF("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]); LOG_INF("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]); LOG_INF("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -318,20 +319,20 @@ struct train_params {
}; };
static void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd); LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult); LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head); LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff); LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer); LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot); LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
} }
static void print_tensor_info(const struct ggml_context * ctx) { static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__); LOG_INF("%s: Allocating ", __func__);
int64_t total = 1; int64_t total = 1;
int i = 0; int i = 0;
for (; i < ggml_n_dims(t); ++i) { for (; i < ggml_n_dims(t); ++i) {
@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {
@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx); gguf_free(ctx);
} else { } else {
// assume llama2.c vocabulary // assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (!file.fp) { if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename); die_fmt("%s: %s", strerror(errno), filename);
@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_init();
struct train_params params = get_default_train_params(); struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) { if (!params_parse(argc, argv, &params)) {
return 1; return 1;
} }
log_set_target(stdout);
Config config; Config config;
TransformerWeights weights = {}; TransformerWeights weights = {};
{ {
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb"); FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) { if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1; return 1;
} }
// read in the config header // read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) { if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1; return 1;
} }
auto shared_weights = config.vocab_size > 0; auto shared_weights = config.vocab_size > 0;
@ -896,7 +899,7 @@ int main(int argc, char ** argv) {
// read in the Transformer weights // read in the Transformer weights
alloc_weights(&weights, &config, shared_weights); alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1; return 1;
} }
fclose(file); fclose(file);
@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model); model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx); ggml_free(model.ctx);
return 0; return 0;

View file

@ -13,14 +13,15 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#include <algorithm>
#include <climits>
#include <cstdio> #include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>
////////////////////////////////////////////////// //////////////////////////////////////////////////

View file

@ -1,12 +1,11 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
#include <cstdio> #include <cstdio>
#include <random>
#include <string> #include <string>
#include <tuple>
#include <vector> #include <vector>
/** /**
@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
GGML_ASSERT(n > 0); GGML_ASSERT(n > 0);
float sum = 0; float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) { for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n"); LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) { for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) { if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n"); LOG(" ..., \n");
i2 = ne[2] - n; i2 = ne[2] - n;
} }
printf(" [\n"); LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) { for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) { if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n"); LOG(" ..., \n");
i1 = ne[1] - n; i1 = ne[1] - n;
} }
printf(" ["); LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) { for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) { if (i0 == n && ne[0] > 2*n) {
printf("..., "); LOG("..., ");
i0 = ne[0] - n; i0 = ne[0] - n;
} }
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
printf("%12.4f", v); LOG("%12.4f", v);
sum += v; sum += v;
if (i0 < ne[0] - 1) printf(", "); if (i0 < ne[0] - 1) LOG(", ");
} }
printf("],\n"); LOG("],\n");
} }
printf(" ],\n"); LOG(" ],\n");
} }
printf(" ]\n"); LOG(" ]\n");
printf(" sum = %f\n", sum); LOG(" sum = %f\n", sum);
} }
} }
@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
} }
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t), t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(), src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "", src1 ? src1_str : "",
ggml_ne_string(t).c_str()); ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed // copy the data from the GPU memory if needed
@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
} }
@ -149,7 +148,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
print_build_info(); gpt_init();
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -166,14 +165,15 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model; llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); LOG_ERR("%s : failed to init\n", __func__);
return 1; return 1;
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
} }
bool OK = run(ctx, params); bool OK = run(ctx, params);
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
LOG_TEE("\n"); LOG("\n");
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
llama_free(ctx); llama_free(ctx);

View file

@ -406,7 +406,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
g_verbose = (params.verbosity == 1); g_verbose = (params.verbosity > 1);
try { try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge(); ctx.run_merge();

View file

@ -153,7 +153,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
throw std::invalid_argument("error: invalid parameter for argument: " + arg); throw std::invalid_argument("error: invalid parameter for argument: " + arg);
} }
if (argc - arg_idx < 2) { if (argc - arg_idx != 2) {
throw std::invalid_argument("error: bad arguments"); throw std::invalid_argument("error: bad arguments");
} }
@ -390,10 +390,17 @@ static void gguf_merge(const split_params & split_params) {
int n_split = 1; int n_split = 1;
int total_tensors = 0; int total_tensors = 0;
auto * ctx_out = gguf_init_empty(); // avoid overwriting existing output file
if (std::ifstream(split_params.output.c_str())) {
fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
exit(EXIT_FAILURE);
}
std::ofstream fout(split_params.output.c_str(), std::ios::binary); std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto * ctx_out = gguf_init_empty();
std::vector<uint8_t> read_data; std::vector<uint8_t> read_data;
std::vector<ggml_context *> ctx_metas; std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs; std::vector<gguf_context *> ctx_ggufs;

View file

@ -158,6 +158,8 @@ int main(int argc, char * argv[]) {
return 1; return 1;
} }
gpt_init();
llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params); llama_context_params cparams = llama_context_params_from_gpt_params(params);

View file

@ -3,7 +3,6 @@
// I'll gradually clean and extend it // I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h" #include "clip.h"
#include "log.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
@ -43,6 +42,11 @@
#include <cinttypes> #include <cinttypes>
#include <limits> #include <limits>
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
//#define CLIP_DEBUG_FUNCTIONS //#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image // RGB uint8 image
@ -168,7 +172,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
static int get_key_idx(const gguf_context * ctx, const char * key) { static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key); int i = gguf_find_key(ctx, key);
if (i == -1) { if (i == -1) {
LOG_TEE("key %s not found in file\n", key); LOG_ERR("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key)); throw std::runtime_error(format("Missing required key: %s", key));
} }
@ -273,7 +277,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor); size_t tensor_size = ggml_nbytes(tensor);
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size, prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
} }
@ -291,7 +295,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary); std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return; return;
} }
@ -310,7 +314,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary); std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) { if (!file.is_open()) {
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return; return;
} }
@ -571,7 +575,7 @@ struct clip_ctx {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr; return nullptr;
} }
@ -585,7 +589,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (load_image_size == nullptr) { if (load_image_size == nullptr) {
load_image_size = clip_image_size_init(); load_image_size = clip_image_size_init();
} }
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width; image_size_width = load_image_size->width;
image_size_height = load_image_size->height; image_size_height = load_image_size->height;
if (is_inf) { if (is_inf) {
@ -1050,21 +1054,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const int idx_name = gguf_find_key(ctx, KEY_NAME); const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name); const std::string name = gguf_get_val_str(ctx, idx_name);
LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); LOG_INF("%s: model name: %s\n", __func__, name.c_str());
} }
LOG_TEE("%s: description: %s\n", __func__, description.c_str()); LOG_INF("%s: description: %s\n", __func__, description.c_str());
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
LOG_TEE("\n"); LOG_INF("\n");
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
// kv // kv
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname); __func__, n_kv, n_tensors, fname);
{ {
std::map<enum ggml_type, uint32_t> n_type; std::map<enum ggml_type, uint32_t> n_type;
@ -1075,7 +1079,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
n_type[type]++; n_type[type]++;
} }
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) { for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i); const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i);
@ -1091,7 +1095,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
replace_all(value, "\n", "\\n"); replace_all(value, "\n", "\\n");
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
} }
// print type counts // print type counts
@ -1100,7 +1104,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
continue; continue;
} }
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
} }
@ -1115,7 +1119,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size; model_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
} }
@ -1142,27 +1146,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
LOG_TEE("%s: CLIP using CUDA backend\n", __func__); LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init(); new_clip->backend = ggml_backend_metal_init();
LOG_TEE("%s: CLIP using Metal backend\n", __func__); LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif #endif
#ifdef GGML_USE_CANN #ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0); new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__); LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif #endif
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0); new_clip->backend = ggml_backend_vk_init(0);
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__); LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif #endif
if (!new_clip->backend) { if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init(); new_clip->backend = ggml_backend_cpu_init();
LOG_TEE("%s: CLIP using CPU backend\n", __func__); LOG_INF("%s: CLIP using CPU backend\n", __func__);
} }
// model size and capabilities // model size and capabilities
@ -1197,16 +1201,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->use_gelu = gguf_get_val_bool(ctx, idx); new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
if (verbosity >= 1) { if (verbosity >= 1) {
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
} }
} }
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors // load tensors
{ {
@ -1219,7 +1223,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_data = ggml_init(params); new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) { if (!new_clip->ctx_data) {
LOG_TEE("%s: ggml_init() failed\n", __func__); LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1227,7 +1231,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
auto fin = std::ifstream(fname, std::ios::binary); auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) { if (!fin) {
LOG_TEE("cannot open model file for loading tensors\n"); LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1249,7 +1253,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg); fin.seekg(offset, std::ios::beg);
if (!fin) { if (!fin) {
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip); clip_free(new_clip);
gguf_free(ctx); gguf_free(ctx);
return nullptr; return nullptr;
@ -1320,23 +1324,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
if (verbosity >= 2) { if (verbosity >= 2) {
LOG_TEE("\n%s: vision model hparams\n", __func__); LOG_INF("\n%s: vision model hparams\n", __func__);
LOG_TEE("image_size %d\n", hparams.image_size); LOG_INF("image_size %d\n", hparams.image_size);
LOG_TEE("patch_size %d\n", hparams.patch_size); LOG_INF("patch_size %d\n", hparams.patch_size);
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
LOG_TEE("v_n_head %d\n", hparams.n_head); LOG_INF("v_n_head %d\n", hparams.n_head);
LOG_TEE("v_n_layer %d\n", hparams.n_layer); LOG_INF("v_n_layer %d\n", hparams.n_layer);
LOG_TEE("v_eps %f\n", hparams.eps); LOG_INF("v_eps %f\n", hparams.eps);
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
LOG_TEE("v_image_grid_pinpoints: "); LOG_INF("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
} }
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
} }
@ -1374,7 +1378,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) { } catch(const std::exception& /*e*/) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__); LOG_ERR("%s: failed to load vision model tensors\n", __func__);
} }
// LLaVA projection // LLaVA projection
@ -1403,7 +1407,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} catch (std::runtime_error & /*e*/) { } } catch (std::runtime_error & /*e*/) { }
try { try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { } } catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection // MobileVLM projection
@ -1504,7 +1508,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf); ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
} }
return new_clip; return new_clip;
@ -1555,7 +1559,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc; int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3); auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false; return false;
} }
build_clip_img_from_data(data, nx, ny, img); build_clip_img_from_data(data, nx, ny, img);
@ -1613,7 +1617,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
int nx, ny, nc; int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) { if (!data) {
LOG_TEE("%s: failed to decode image bytes\n", __func__); LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false; return false;
} }
@ -1622,13 +1626,13 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
//check if image needs downscaling //check if image needs downscaling
if (nx > maxdims || ny > maxdims) { if (nx > maxdims || ny > maxdims) {
LOG_TEE("\nImage requires resizing: original size %d x %d scaling to max %d px\n",nx,ny,maxdims); printf("\nImage requires resizing: original size %d x %d scaling to max %d px\n",nx,ny,maxdims);
uint8_t* resized_image = scale_down_image(data, nx, ny, nc, maxdims, maxdims); uint8_t* resized_image = scale_down_image(data, nx, ny, nc, maxdims, maxdims);
if(resized_image!=nullptr) if(resized_image!=nullptr)
{ {
stbi_image_free(data); // Free the original image buffer and assign the new one stbi_image_free(data); // Free the original image buffer and assign the new one
data = resized_image; data = resized_image;
LOG_TEE("Resized to clamped to %d x %d\n",nx,ny); printf("Resized to clamped to %d x %d\n",nx,ny);
} }
} }
@ -1646,7 +1650,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
} }
if (need_letterbox) { if (need_letterbox) {
LOG_TEE("\nImage requires letterboxing: %d x %d changed to %d x %d\n",nx,ny,new_width, new_height); printf("\nImage requires letterboxing: %d x %d changed to %d x %d\n",nx,ny,new_width, new_height);
uint8_t* letterboxed_image = make_new_letterbox_img(data, nx, ny, nc, new_width, new_height); uint8_t* letterboxed_image = make_new_letterbox_img(data, nx, ny, nc, new_width, new_height);
if(letterboxed_image!=nullptr) if(letterboxed_image!=nullptr)
{ {
@ -1845,7 +1849,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution; int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution; max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution; min_wasted_resolution = wasted_resolution;
@ -1963,7 +1967,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
const int multiple = fmin(ceil(ratio), max_slice_nums); const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images; std::vector<std::vector<clip_image_u8 *>> images;
LOG_TEE("%s: multiple %d\n", __func__, multiple); LOG_INF("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>()); images.push_back(std::vector<clip_image_u8 *>());
if (multiple <= 1) { if (multiple <= 1) {
@ -1978,17 +1982,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
clip_image_u8 * source_image = clip_image_u8_init(); clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second); bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image); images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 * refine_image = clip_image_u8_init(); clip_image_u8 * refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches // split_to_patches
int width = refine_image->nx; int width = refine_image->nx;
@ -2045,7 +2049,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
int idx = 0; int idx = 0;
for (size_t i = 0; i < imgs.size(); ++i) { for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) { for (size_t j = 0; j < imgs[i].size(); ++j) {
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32 * res = clip_image_f32_init(); clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
res_imgs->data[idx++] = *res; res_imgs->data[idx++] = *res;
@ -2057,7 +2061,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
bool pad_to_square = true; bool pad_to_square = true;
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
@ -2134,7 +2138,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
} }
for (size_t i = 0; i < patches.size(); i++) { for (size_t i = 0; i < patches.size(); i++) {
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free(patches[i]); clip_image_u8_free(patches[i]);
} }
@ -2370,7 +2374,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
@ -2382,7 +2386,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_ERR("This gguf file seems to have no vision encoder\n");
return false; return false;
} }
@ -2612,7 +2616,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
new_type = type; new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
} }
const size_t n_elms = ggml_nelements(cur); const size_t n_elms = ggml_nelements(cur);
float * f32_data; float * f32_data;
@ -2631,7 +2635,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
f32_data = (float *)conv_buf.data(); f32_data = (float *)conv_buf.data();
break; break;
default: default:
LOG_TEE("Please use an input file in f32 or f16\n"); LOG_ERR("Please use an input file in f32 or f16\n");
gguf_free(ctx_out); gguf_free(ctx_out);
return false; return false;
} }
@ -2658,7 +2662,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
fout.put(0); fout.put(0);
} }
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
} }
@ -2674,8 +2678,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
gguf_free(ctx_out); gguf_free(ctx_out);
{ {
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
} }
return true; return true;

View file

@ -10,6 +10,7 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <vector> #include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) { static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@ -20,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch; n_eval = n_batch;
} }
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -75,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
size_t img_base64_str_start, img_base64_str_end; size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL; return NULL;
} }
@ -89,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) { if (!embed) {
LOG_TEE("%s: could not load image from base64 string.\n", __func__); LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL; return NULL;
} }
@ -114,9 +115,9 @@ struct llava_context {
}; };
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\n example usage:\n"); LOG("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@ -126,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
auto prompt = params->prompt; auto prompt = params->prompt;
if (prompt_contains_image(prompt)) { if (prompt_contains_image(prompt)) {
if (!params->image.empty()) { if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n"); LOG_INF("using base64 encoded image instead of command line image path\n");
} }
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) { if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__); LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL; return NULL;
} }
params->prompt = remove_image_from_prompt(prompt); params->prompt = remove_image_from_prompt(prompt);
@ -156,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos); system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} else { } else {
@ -177,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
if (params->verbose_prompt) { if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
} }
} }
} }
@ -188,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
// generate the response // generate the response
LOG_TEE("\n"); LOG("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
if (!smpl) { if (!smpl) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1); exit(1);
} }
@ -202,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
response += tmp; response += tmp;
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp); LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@ -211,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
} }
gpt_sampler_free(smpl); gpt_sampler_free(smpl);
printf("\n"); LOG("\n");
} }
static struct llama_model * llava_init(gpt_params * params) { static struct llama_model * llava_init(gpt_params * params) {
@ -222,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__); LOG_ERR("%s: unable to load model\n" , __func__);
return NULL; return NULL;
} }
return model; return model;
@ -245,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL; return NULL;
} }
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip; ctx_llava->ctx_clip = ctx_clip;
@ -268,12 +269,6 @@ static void llava_free(struct llava_context * ctx_llava) {
llama_backend_free(); llama_backend_free();
} }
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
ggml_time_init(); ggml_time_init();
@ -283,27 +278,23 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS gpt_init();
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv); print_usage(argc, argv);
return 1; return 1;
} }
auto model = llava_init(&params);
auto * model = llava_init(&params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__); fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1; return 1;
} }
if (prompt_contains_image(params.prompt)) { if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model); auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, ""); auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt // process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
@ -314,11 +305,11 @@ int main(int argc, char ** argv) {
llava_free(ctx_llava); llava_free(ctx_llava);
} else { } else {
for (auto & image : params.image) { for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model); auto * ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image); auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) { if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1; return 1;
} }

View file

@ -1,13 +1,23 @@
#include "clip.h" #include "clip.h"
#include "common.h"
#include "llama.h"
#include "llava.h" #include "llava.h"
#include "base64.hpp"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <limits>
#include <vector> #include <vector>
#include <numeric>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image // RGB uint8 image
struct clip_image_u8 { struct clip_image_u8 {
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
int downscaled_height = static_cast<int>(original_height * scale); int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution; int wasted_resolution = (width * height) - effective_resolution;
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution; max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution; min_wasted_resolution = wasted_resolution;
@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
img_res_v.size = 0; img_res_v.size = 0;
img_res_v.data = nullptr; img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_TEE("%s: unable to preprocess image\n", __func__); LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data; delete[] img_res_v.data;
return false; return false;
} }
@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
} }
if (!encoded) { if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
} }
const int64_t t_img_enc_steop_batch_us = ggml_time_us(); const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0; int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) { for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img->nx; load_image_size->width = img->nx;
load_image_size->height = img->ny; load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size); clip_add_load_image_size(ctx_clip, load_image_size);
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
} }
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding // flat / default llava-1.5 type embedding
@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data; delete[] img_res_v.data;
if (!encoded) { if (!encoded) {
LOG_TEE("Unable to encode image\n"); LOG_ERR("Unable to encode image\n");
return false; return false;
} }
@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) { if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false; return false;
} }
} }
const int64_t t_img_enc_batch_us = ggml_time_us(); const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip); const int32_t * image_grid = clip_image_grid(ctx_clip);
@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
// clip_image_save_to_bmp(*tmp, "image_feature.bmp"); // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
} }
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us(); const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true; return true;
} }
@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip); auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) { if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false; return false;
} }
return true; return true;
@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
} }
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) { if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n"); LOG_ERR("Unable to allocate memory for image embeddings\n");
return false; return false;
} }
int n_img_pos; int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__); LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd); free(image_embd);
return false; return false;
} }
@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
} }
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) { if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
clip_image_u8 * img = clip_image_u8_init(); clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img); clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL; return NULL;
} }
@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) { if (!image_embed_result) {
clip_image_u8_free(img); clip_image_u8_free(img);
LOG_TEE("%s: coulnd't embed the image\n", __func__); LOG_ERR("%s: coulnd't embed the image\n", __func__);
return NULL; return NULL;
} }
@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb"); auto file = fopen(path, "rb");
if (file == NULL) { if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path); LOG_ERR("%s: can't read file %s\n", __func__, path);
return false; return false;
} }
@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) { if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error"); perror("Memory allocation error");
fclose(file); fclose(file);
return false; return false;
@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
long image_bytes_length; long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) { if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path); LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL; return NULL;
} }

View file

@ -7,9 +7,12 @@
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
#include <algorithm>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring>
#include <vector> #include <vector>
#include <iostream> // TODO: remove me
struct llava_context { struct llava_context {
struct clip_ctx * ctx_clip = NULL; struct clip_ctx * ctx_clip = NULL;
@ -18,14 +21,8 @@ struct llava_context {
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
} }
static struct llama_model * llava_init(gpt_params * params) { static struct llama_model * llava_init(gpt_params * params) {
@ -36,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__); LOG_ERR("%s: unable to load model\n" , __func__);
return NULL; return NULL;
} }
return model; return model;
@ -51,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) { if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048" // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048; ctx_params.n_ctx = 2048;
} else { } else {
ctx_params.n_ctx = params->n_ctx; ctx_params.n_ctx = params->n_ctx;
@ -60,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL; return NULL;
} }
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model; ctx_llava->model = model;
@ -89,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
if (prompt.empty()) { if (prompt.empty()) {
prompt = "describe the image in detail."; prompt = "describe the image in detail.";
} }
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip; return ctx_clip;
} }
@ -101,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
n_eval = n_batch; n_eval = n_batch;
} }
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false; return false;
} }
*n_past += n_eval; *n_past += n_eval;
@ -125,7 +122,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed; slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
@ -143,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
else if (has_minicpmv_projector == 3) { else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n"; system_prompt = "<|im_start|>user\n";
} }
LOG_TEE("%s: image token past: %d\n", __func__, n_past); LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@ -162,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
} }
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
} }
LOG_TEE("%s: image token past: %d\n", __func__, n_past); LOG_INF("%s: image token past: %d\n", __func__, n_past);
} }
static const char * sample(struct gpt_sampler * smpl, static const char * sample(struct gpt_sampler * smpl,
@ -181,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl,
} }
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params); auto * ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) { if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL; return NULL;
} }
// process the prompt // process the prompt
if (params->prompt.empty() && params->interactive == false) { if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on"); LOG_ERR("prompt should be given or interactive mode should be on");
return NULL; return NULL;
} }
auto model = llava_init(params); auto * model = llava_init(params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL; return NULL;
} }
const int64_t t_llava_init_start_us = ggml_time_us(); const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model); auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip; ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us(); const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us(); const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past); process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us(); const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds); llava_image_embed_free(embeds);
return ctx_llava; return ctx_llava;
} }
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt; std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) { if (!is_first) {
@ -238,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
// generate the response // generate the response
LOG_TEE("\n"); LOG_INF("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
return smpl; return smpl;
@ -259,12 +256,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
#ifndef LOG_DISABLE_LOGS gpt_init();
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty())) { if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv); show_additional_info(argc, argv);
@ -273,21 +265,23 @@ int main(int argc, char ** argv) {
for (auto & image : params.image) { for (auto & image : params.image) {
int n_past = 0; int n_past = 0;
auto ctx_llava = minicpmv_init(&params, image, n_past); auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) { if (!params.prompt.empty()) {
LOG_TEE("<user>%s\n", params.prompt.c_str()); LOG("<user>%s\n", params.prompt.c_str());
LOG_TEE("<assistant>"); LOG("<assistant>");
auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true); auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = ""; std::string response;
bool have_tmp = false; bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, smpl, n_past); const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp; response += tmp;
if (strcmp(tmp, "</s>") == 0){ if (strcmp(tmp, "</s>") == 0){
if(!have_tmp)continue; if (!have_tmp) {
else break; continue;
}
break;
} }
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true; have_tmp = true;
@ -299,15 +293,15 @@ int main(int argc, char ** argv) {
gpt_sampler_free(smpl); gpt_sampler_free(smpl);
}else { }else {
while (true) { while (true) {
LOG_TEE("<user>"); LOG("<user>");
std::string prompt; std::string prompt;
std::getline(std::cin, prompt); std::getline(std::cin, prompt);
LOG_TEE("<assistant>"); LOG("<assistant>");
auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true); auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = ""; std::string response;
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, smpl, n_past); const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp; response += tmp;
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior

View file

@ -1,6 +1,7 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h" #include "sampling.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cstdio> #include <cstdio>
@ -42,18 +43,14 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
const int W = 15; // lookahead window const int W = 15; // lookahead window
const int N = 5; // n-gram size const int N = 5; // n-gram size
const int G = 15; // max verification n-grams const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache; const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookahead", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp // init llama.cpp
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
const int max_tokens_list_size = max_context_size - 4; const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) { if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1; return 1;
} }
fprintf(stderr, "\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr); fflush(stderr);
@ -166,7 +163,7 @@ int main(int argc, char ** argv) {
{ {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
fflush(stdout); fflush(stdout);
} }
} }
@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__); LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1; return 1;
} }
@ -293,10 +290,10 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
if (v == 0) { if (v == 0) {
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} else { } else {
// print light cyan // print light cyan
printf("\033[0;96m%s\033[0m", token_str.c_str()); LOG("\033[0;96m%s\033[0m", token_str.c_str());
} }
fflush(stdout); fflush(stdout);
@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
// print known n-grams starting with token id (debug) // print known n-grams starting with token id (debug)
if (0 && v == 0) { if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) { if (ngrams_observed.cnt[id] > 0) {
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
} }
for (int i = 0; i < ngrams_observed.cnt[id]; i++) { for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
printf(" - ngram %2d: ", i); LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1); const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) { for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
printf("\n"); LOG("\n");
} }
} }
@ -455,20 +452,20 @@ int main(int argc, char ** argv) {
auto t_dec_end = ggml_time_us(); auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n"); LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("W = %2d\n", W); LOG_INF("W = %2d\n", W);
LOG_TEE("N = %2d\n", N); LOG_INF("N = %2d\n", N);
LOG_TEE("G = %2d\n", G); LOG_INF("G = %2d\n", G);
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("n_predict = %d\n", n_predict); LOG_INF("n_predict = %d\n", n_predict);
LOG_TEE("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("\n"); LOG_INF("\n");
gpt_perf_print(ctx, smpl); gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl); gpt_sampler_free(smpl);
@ -482,7 +479,7 @@ int main(int argc, char ** argv) {
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }

View file

@ -5,13 +5,12 @@
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"
#include <cmath>
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#include <cinttypes>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_map>
int main(int argc, char ** argv){ int main(int argc, char ** argv){
gpt_params params; gpt_params params;
@ -20,6 +19,8 @@ int main(int argc, char ** argv){
return 1; return 1;
} }
gpt_init();
const int n_draft = params.n_draft; const int n_draft = params.n_draft;
// init llama.cpp // init llama.cpp
@ -49,7 +50,7 @@ int main(int argc, char ** argv){
try { try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) { } catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1); exit(1);
} }
} }
@ -128,7 +129,7 @@ int main(int argc, char ** argv){
const int64_t eta_min = eta_ms / (60*1000); const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
} }
// After each chunk, update the dynamic ngram cache with the context ngram cache: // After each chunk, update the dynamic ngram cache with the context ngram cache:
@ -136,24 +137,24 @@ int main(int argc, char ** argv){
ngram_cache_context.clear(); ngram_cache_context.clear();
} }
LOG_TEE("\n"); LOG("\n");
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("n_draft = %d\n", n_draft); LOG_INF("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_TEE("n_drafted = %d\n", n_drafted); LOG_INF("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }

View file

@ -3,6 +3,7 @@
#include "common.h" #include "common.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "sampling.h" #include "sampling.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cstdint> #include <cstdint>
@ -18,17 +19,13 @@ int main(int argc, char ** argv){
return 1; return 1;
} }
gpt_init();
// max. number of additional tokens to draft if match is found // max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft; const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache; const bool dump_kv_cache = params.dump_kv_cache;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookup", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp // init llama.cpp
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -58,7 +55,7 @@ int main(int argc, char ** argv){
try { try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) { } catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1); exit(1);
} }
} }
@ -76,14 +73,14 @@ int main(int argc, char ** argv){
const int max_tokens_list_size = max_context_size - 4; const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) { if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1; return 1;
} }
fprintf(stderr, "\n\n"); LOG("\n\n");
for (auto id : inp) { for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr); fflush(stderr);
@ -124,7 +121,7 @@ int main(int argc, char ** argv){
} }
// print current draft sequence // print current draft sequence
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0; int i_dft = 0;
while (true) { while (true) {
@ -136,7 +133,7 @@ int main(int argc, char ** argv){
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id);
if (!params.use_color) { if (!params.use_color) {
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
if (llama_token_is_eog(model, id)) { if (llama_token_is_eog(model, id)) {
@ -147,7 +144,7 @@ int main(int argc, char ** argv){
// check if the target token matches the draft // check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) { if (i_dft < (int) draft.size() && id == draft[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept; ++n_accept;
++n_past; ++n_past;
++i_dft; ++i_dft;
@ -161,19 +158,19 @@ int main(int argc, char ** argv){
if (params.use_color) { if (params.use_color) {
// color accepted draft token // color accepted draft token
printf("\033[34m%s\033[0m", token_str.c_str()); LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout); fflush(stdout);
} }
continue; continue;
} }
if (params.use_color) { if (params.use_color) {
printf("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
fflush(stdout); fflush(stdout);
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear(); draft.clear();
draft.push_back(id); draft.push_back(id);
@ -224,22 +221,22 @@ int main(int argc, char ** argv){
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG_TEE("\n\n"); LOG("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("n_draft = %d\n", n_draft); LOG_INF("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict); LOG_INF("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted); LOG_INF("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept); LOG_INF("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ntarget:\n\n"); LOG_INF("\ntarget:\n\n");
gpt_perf_print(ctx, smpl); gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl); gpt_sampler_free(smpl);
@ -251,7 +248,7 @@ int main(int argc, char ** argv){
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); LOG("\n\n");
return 0; return 0;
} }

View file

@ -1,13 +1,12 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
#include "log.h"
#include "sampling.h" #include "sampling.h"
#include "llama.h" #include "llama.h"
#include "build-info.h" #include "build-info.h"
#include <cassert> #include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
@ -43,11 +42,13 @@ static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false; static bool is_interacting = false;
static bool need_insert_eot = false; static bool need_insert_eot = false;
static void print_usage(int, char ** argv) { static void print_usage(int argc, char ** argv) {
printf("\nexample usage:\n"); (void) argc;
printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); LOG("\nexample usage:\n");
printf("\n"); LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
LOG("\n");
} }
static bool file_exists(const std::string & path) { static bool file_exists(const std::string & path) {
@ -75,8 +76,7 @@ static void write_logfile(
const bool success = fs_create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
__func__, params.logdir.c_str());
return; return;
} }
@ -84,7 +84,7 @@ static void write_logfile(
FILE * logfile = fopen(logfile_path.c_str(), "w"); FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) { if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return; return;
} }
@ -114,7 +114,7 @@ static void sigint_handler(int signo) {
need_insert_eot = true; need_insert_eot = true;
} else { } else {
console::cleanup(); console::cleanup();
printf("\n"); LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl); gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130); _exit(130);
@ -123,17 +123,11 @@ static void sigint_handler(int signo) {
} }
#endif #endif
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
llama_chat_msg new_msg{role, content}; llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content}); chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str()); LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted; return formatted;
} }
@ -144,55 +138,46 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
auto & sparams = params.sparams; auto & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
// TODO: Dump params ?
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
// save choice to use color for later // save choice to use color for later
// (note for later: this is a slightly awkward choice) // (note for later: this is a slightly awkward choice)
console::init(params.simple_io, params.use_color); console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); }); atexit([]() { console::cleanup(); });
if (params.logits_all) { if (params.logits_all) {
printf("\n************\n"); LOG_ERR("************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.embedding) { if (params.embedding) {
printf("\n************\n"); LOG_ERR("************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n"); LOG_ERR("************\n\n");
return 0; return 0;
} }
if (params.n_ctx != 0 && params.n_ctx < 8) { if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
} }
if (params.rope_freq_base != 0.0) { if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
} }
if (params.rope_freq_scale != 0.0) { if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
} }
print_build_info(); LOG_INF("%s: llama backend init\n", __func__);
LOG("%s: llama backend init\n", __func__);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -207,21 +192,19 @@ int main(int argc, char ** argv) {
g_smpl = &smpl; g_smpl = &smpl;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__); LOG_ERR("%s: error: unable to load model\n", __func__);
return 1; return 1;
} }
LOG("%s: llama threadpool init = n_threads = %d\n", LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
__func__,
(int) params.cpuparams.n_threads
);
struct ggml_threadpool_params tpp_batch = struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp = struct ggml_threadpool_params tpp =
@ -233,8 +216,8 @@ int main(int argc, char ** argv) {
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch); threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) { if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1); return 1;
} }
// Start the non-batch threadpool in the paused state // Start the non-batch threadpool in the paused state
@ -243,55 +226,54 @@ int main(int argc, char ** argv) {
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) { if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1); return 1;
} }
llama_attach_threadpool(ctx, threadpool, threadpool_batch); llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, n_ctx);
} }
// print chat template example in conversation mode // print chat template example in conversation mode
if (params.conversation) { if (params.conversation) {
if (params.enable_chat_template) { if (params.enable_chat_template) {
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
} else { } else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
} }
} }
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
LOG_INF("\n");
} }
std::string path_session = params.path_prompt_cache; std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
if (!path_session.empty()) { if (!path_session.empty()) {
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
if (!file_exists(path_session)) { if (!file_exists(path_session)) {
LOG_TEE("%s: session file does not exist, will create.\n", __func__); LOG_INF("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) { } else if (file_is_empty(path_session)) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else { } else {
// The file exists and is not empty // The file exists and is not empty
session_tokens.resize(n_ctx); session_tokens.resize(n_ctx);
size_t n_token_count_out = 0; size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1; return 1;
} }
session_tokens.resize(n_token_count_out); session_tokens.resize(n_token_count_out);
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
} }
} }
@ -299,7 +281,8 @@ int main(int argc, char ** argv) {
if (!llama_model_has_encoder(model)) { if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model)); GGML_ASSERT(!llama_add_eos_token(model));
} }
LOG("add_bos: %d\n", add_bos);
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
@ -308,31 +291,31 @@ int main(int argc, char ** argv) {
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt; : params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n"); LOG_DBG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true); embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else { } else {
LOG("use session tokens\n"); LOG_DBG("use session tokens\n");
embd_inp = session_tokens; embd_inp = session_tokens;
} }
LOG("prompt: \"%s\"\n", log_tostr(prompt)); LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
} }
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
if (add_bos) { if (add_bos) {
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else { } else {
LOG_TEE("error: input is empty\n"); LOG_ERR("input is empty\n");
return -1; return -1;
} }
} }
// Tokenize negative prompt // Tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) { if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
} }
@ -346,29 +329,28 @@ int main(int argc, char ** argv) {
n_matching_session_tokens++; n_matching_session_tokens++;
} }
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
LOG_TEE("%s: using full prompt from session file\n", __func__); LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) { } else if (n_matching_session_tokens >= embd_inp.size()) {
LOG_TEE("%s: session file has exact match for prompt!\n", __func__); LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) { } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} else { } else {
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} }
// remove any "future" tokens that we might have inherited from the previous session // remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
} }
LOGLN( LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu", embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force // if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits // reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1);
} }
@ -390,21 +372,20 @@ int main(int argc, char ** argv) {
} }
if (params.verbose_prompt) { if (params.verbose_prompt) {
LOG_TEE("\n"); LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (params.n_keep > add_bos) { if (params.n_keep > add_bos) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__); LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
LOG_TEE("'\n"); LOG("'\n");
} }
LOG_TEE("\n"); LOG_INF("\n");
} }
// ctrl+C handling // ctrl+C handling
@ -424,40 +405,40 @@ int main(int argc, char ** argv) {
} }
if (params.interactive) { if (params.interactive) {
LOG_TEE("%s: interactive mode on.\n", __func__); LOG("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) { for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n"); LOG("Input prefix with BOS\n");
} }
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) { if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) { for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
} }
} }
} }
@ -465,15 +446,15 @@ int main(int argc, char ** argv) {
smpl = gpt_sampler_init(model, sparams); smpl = gpt_sampler_init(model, sparams);
if (!smpl) { if (!smpl) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1); return 1;
} }
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state // group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1) // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@ -487,9 +468,9 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
} }
LOG_TEE("\n\n"); LOG("\n");
if (params.interactive) { if (params.interactive) {
const char * control_message; const char * control_message;
@ -501,11 +482,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
LOG_TEE("== Running in interactive mode. ==\n"); LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); LOG( " - Press Ctrl+C to interject at any time.\n");
#endif #endif
LOG_TEE( "%s\n", control_message); LOG( "%s\n", control_message);
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
@ -544,7 +525,7 @@ int main(int argc, char ** argv) {
llama_token * enc_input_buf = embd_inp.data(); llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@ -570,9 +551,8 @@ int main(int argc, char ** argv) {
embd.resize(max_embd_size); embd.resize(max_embd_size);
console::set_display(console::error); console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset); console::set_display(console::reset);
fflush(stdout);
} }
if (ga_n == 1) { if (ga_n == 1) {
@ -580,29 +560,35 @@ int main(int argc, char ** argv) {
// if we run out of context: // if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past) // - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() >= n_ctx) { if (n_past + (int) embd.size() >= n_ctx) {
if (params.n_predict == -2) { if (!params.ctx_shift){
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break; break;
} else {
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG_DBG("clear session path\n");
path_session.clear();
} }
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG("clear session path\n");
path_session.clear();
} }
} else { } else {
// context extension via Self-Extend // context extension via Self-Extend
@ -611,10 +597,10 @@ int main(int argc, char ** argv) {
const int bd = (ga_w/ga_n)*(ga_n - 1); const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w; const int dd = (ga_w/ga_n) - ib*bd - ga_w;
LOG("\n"); LOG_DBG("\n");
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@ -624,7 +610,7 @@ int main(int argc, char ** argv) {
ga_i += ga_w/ga_n; ga_i += ga_w/ga_n;
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
} }
} }
@ -656,19 +642,19 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__); LOG_ERR("%s : failed to eval\n", __func__);
return 1; return 1;
} }
n_past += n_eval; n_past += n_eval;
LOG("n_past = %d\n", n_past); LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time // Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) { if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
} }
} }
@ -686,14 +672,14 @@ int main(int argc, char ** argv) {
need_to_save_session = false; need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str()); LOG_DBG("saved session to %s\n", path_session.c_str());
} }
const llama_token id = gpt_sampler_sample(smpl, ctx, -1); const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
gpt_sampler_accept(smpl, id, /* apply_grammar= */ true); gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id); embd.push_back(id);
@ -703,16 +689,16 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
LOG("n_remain: %d\n", n_remain); LOG_DBG("n_remain: %d\n", n_remain);
} else { } else {
// some user input remains from prompt or interaction, forward it to processing // some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false); gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@ -727,7 +713,7 @@ int main(int argc, char ** argv) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special); const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output // Console/Stream Output
fprintf(stdout, "%s", token_str.c_str()); LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log // Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check // Note: Generated tokens are created one by one hence this check
@ -739,8 +725,6 @@ int main(int argc, char ** argv) {
output_tokens.push_back(id); output_tokens.push_back(id);
output_ss << token_str; output_ss << token_str;
} }
fflush(stdout);
} }
} }
@ -789,13 +773,13 @@ int main(int argc, char ** argv) {
} }
if (is_antiprompt) { if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str()); LOG_DBG("found antiprompt: %s\n", last_output.c_str());
} }
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG("found an EOG token\n"); LOG_DBG("found an EOG token\n");
if (params.interactive) { if (params.interactive) {
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
@ -809,7 +793,7 @@ int main(int argc, char ** argv) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
} }
is_interacting = true; is_interacting = true;
printf("\n"); LOG("\n");
} }
} }
@ -820,21 +804,21 @@ int main(int argc, char ** argv) {
} }
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n"); LOG_DBG("waiting for user input\n");
if (params.conversation) { if (params.conversation) {
printf("\n> "); LOG("\n> ");
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n"); LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_token_bos(model));
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) { if (!params.input_prefix.empty() && !params.conversation) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str()); LOG("%s", params.input_prefix.c_str());
} }
// color user input only // color user input only
@ -857,11 +841,11 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) { if (!params.input_suffix.empty() && !params.conversation) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
printf("%s", params.input_suffix.c_str()); LOG("%s", params.input_suffix.c_str());
} }
LOG("buffer: '%s'\n", buffer.c_str()); LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size(); const size_t original_size = embd_inp.size();
@ -878,7 +862,7 @@ int main(int argc, char ** argv) {
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response // if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) { if (need_insert_eot && format_chat) {
@ -901,9 +885,9 @@ int main(int argc, char ** argv) {
assistant_ss.str(""); assistant_ss.str("");
n_remain -= line_inp.size(); n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain); LOG_DBG("n_remain: %d\n", n_remain);
} else { } else {
LOG("empty line, passing control back\n"); LOG_DBG("empty line, passing control back\n");
} }
input_echo = false; // do not echo this again input_echo = false; // do not echo this again
@ -919,7 +903,7 @@ int main(int argc, char ** argv) {
// end of generation // end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n"); LOG(" [end of text]\n");
break; break;
} }
@ -932,11 +916,11 @@ int main(int argc, char ** argv) {
} }
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
} }
LOG_TEE("\n"); LOG("\n\n");
gpt_perf_print(ctx, smpl); gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
@ -950,9 +934,5 @@ int main(int argc, char ** argv) {
ggml_threadpool_free(threadpool); ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch); ggml_threadpool_free(threadpool_batch);
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0; return 0;
} }

View file

@ -1,5 +1,6 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cmath> #include <cmath>
@ -8,9 +9,9 @@
#include <vector> #include <vector>
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n"); LOG("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\n"); LOG("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -24,6 +25,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
int n_junk = params.n_junk; int n_junk = params.n_junk;
int n_keep = params.n_keep; int n_keep = params.n_keep;
int n_grp = params.grp_attn_n; int n_grp = params.grp_attn_n;
@ -63,7 +66,7 @@ int main(int argc, char ** argv) {
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); LOG_ERR("%s: unable to load model\n" , __func__);
return 1; return 1;
} }
@ -77,7 +80,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1; return 1;
} }
@ -107,14 +110,14 @@ int main(int argc, char ** argv) {
const int n_batch = ctx_params.n_batch; const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp; const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
// print the prompt token-by-token // print the prompt token-by-token
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
LOG_TEE("prompt tokens: %d\n", n_tokens_all); LOG_INF("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str()); //LOG_INF("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(params.n_batch, 0, 1); llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
@ -145,11 +148,11 @@ int main(int argc, char ** argv) {
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_INF("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
if (i + n_batch >= n_tokens_all) { if (i + n_batch >= n_tokens_all) {
break; break;
@ -159,7 +162,7 @@ int main(int argc, char ** argv) {
for (int i = n_ctx; i < n_tokens_all; i += n_batch) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch; const int n_discard = n_batch;
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@ -179,18 +182,18 @@ int main(int argc, char ** argv) {
} }
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
} }
{ {
const int n_discard = n_past - n_ctx + n_predict; const int n_discard = n_past - n_ctx + n_predict;
if (n_discard > 0) { if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@ -201,17 +204,16 @@ int main(int argc, char ** argv) {
} }
} }
LOG_TEE("\n"); LOG_INF("\n");
LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
LOG_TEE("\n"); LOG_INF("\n");
// main loop // main loop
int n_cur = n_tokens_all; int n_cur = n_tokens_all;
int n_decode = 0; int n_decode = 0;
LOG_TEE("%s", prompt_suffix.c_str()); LOG_INF("%s", prompt_suffix.c_str());
fflush(stdout);
const auto t_main_start = ggml_time_us(); const auto t_main_start = ggml_time_us();
@ -222,13 +224,12 @@ int main(int argc, char ** argv) {
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG_TEE("\n"); LOG("\n");
break; break;
} }
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
n_decode += 1; n_decode += 1;
@ -243,22 +244,22 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model // evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) { if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1; return 1;
} }
} }
LOG_TEE("\n"); LOG("\n");
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n"); LOG("\n");
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
fprintf(stderr, "\n"); LOG("\n");
llama_sampler_free(smpl); llama_sampler_free(smpl);

View file

@ -1,14 +1,16 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <iostream> // TODO: remove me
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n"); LOG("\nexample usage:\n");
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
LOG_TEE("\n"); LOG("\n");
} }
struct chunk { struct chunk {
@ -17,7 +19,7 @@ struct chunk {
// original file position // original file position
size_t filepos; size_t filepos;
// original text data // original text data
std::string textdata = ""; std::string textdata;
// tokenized text data // tokenized text data
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
// embedding // embedding
@ -31,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
std::ifstream f(filename.c_str()); std::ifstream f(filename.c_str());
if (!f.is_open()) { if (!f.is_open()) {
fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); LOG_ERR("could not open file %s\n", filename.c_str());
return chunks; return chunks;
} }
chunk current_chunk; chunk current_chunk;
char buffer[1024]; char buffer[1024];
int64_t filepos = 0; int64_t filepos = 0;
std::string current = ""; std::string current;
while (f.read(buffer, 1024)) { while (f.read(buffer, 1024)) {
current += std::string(buffer, f.gcount()); current += std::string(buffer, f.gcount());
size_t pos; size_t pos;
@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
// run model // run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) { if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__); LOG_ERR("%s : failed to decode\n", __func__);
} }
for (int i = 0; i < batch.n_tokens; i++) { for (int i = 0; i < batch.n_tokens; i++) {
@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
if (embd == NULL) { if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i); embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) { if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
continue; continue;
} }
} }
@ -116,24 +118,24 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
// For BERT models, batch size must be equal to ubatch size // For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch; params.n_ubatch = params.n_batch;
params.embedding = true; params.embedding = true;
if (params.chunk_size <= 0) { if (params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n"); LOG_ERR("chunk_size must be positive\n");
return 1; return 1;
} }
if (params.context_files.empty()) { if (params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n"); LOG_ERR("context_files must be specified\n");
return 1; return 1;
} }
print_build_info(); LOG_INF("processing files:\n");
printf("processing files:\n");
for (auto & context_file : params.context_files) { for (auto & context_file : params.context_files) {
printf("%s\n", context_file.c_str()); LOG_INF("%s\n", context_file.c_str());
} }
std::vector<chunk> chunks; std::vector<chunk> chunks;
@ -141,7 +143,7 @@ int main(int argc, char ** argv) {
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
} }
printf("Number of chunks: %ld\n", chunks.size()); LOG_INF("Number of chunks: %ld\n", chunks.size());
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@ -153,7 +155,7 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_init.context; llama_context * ctx = llama_init.context;
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); LOG_ERR("%s: unable to load model\n", __func__);
return 1; return 1;
} }
@ -162,19 +164,19 @@ int main(int argc, char ** argv) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) { if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); LOG_ERR("%s: pooling type NONE not supported\n", __func__);
return 1; return 1;
} }
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx); __func__, n_ctx_train, n_ctx);
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_INF("\n");
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
} }
// max batch size // max batch size
@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
for (auto & chunk : chunks) { for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) { if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch); __func__, (long long int) inp.size(), (long long int) n_batch);
return 1; return 1;
} }
@ -199,12 +201,12 @@ int main(int argc, char ** argv) {
// tokenization stats // tokenization stats
if (params.verbose_prompt) { if (params.verbose_prompt) {
for (int i = 0; i < (int) chunks.size(); i++) { for (int i = 0; i < (int) chunks.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
} }
fprintf(stderr, "\n\n"); LOG_INF("\n\n");
} }
} }
@ -256,7 +258,7 @@ int main(int argc, char ** argv) {
// start loop, receive query and return top k similar chunks based on cosine similarity // start loop, receive query and return top k similar chunks based on cosine similarity
std::string query; std::string query;
while (true) { while (true) {
printf("Enter query: "); LOG("Enter query: ");
std::getline(std::cin, query); std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true); std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
@ -280,18 +282,18 @@ int main(int argc, char ** argv) {
return a.second > b.second; return a.second > b.second;
}); });
printf("Top %d similar chunks:\n", params.sparams.top_k); LOG("Top %d similar chunks:\n", params.sparams.top_k);
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
printf("similarity: %f\n", similarities[i].second); LOG("similarity: %f\n", similarities[i].second);
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
printf("--------------------\n"); LOG("--------------------\n");
} }
} }
} }
LOG_TEE("\n"); LOG("\n");
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
// clean up // clean up

View file

@ -272,7 +272,6 @@ def start_server_background(args):
server_args.append('--cont-batching') server_args.append('--cont-batching')
server_args.append('--metrics') server_args.append('--metrics')
server_args.append('--flash-attn') server_args.append('--flash-attn')
server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]] args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}") print(f"bench: starting server with: {' '.join(args)}")
pkwargs = { pkwargs = {

File diff suppressed because it is too large Load diff

1
examples/server/tests/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
.venv

View file

@ -1372,8 +1372,6 @@ def start_server_background(context):
server_args.append('--verbose') server_args.append('--verbose')
if context.lora_file: if context.lora_file:
server_args.extend(['--lora', context.lora_file]) server_args.extend(['--lora', context.lora_file])
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [context.server_path, *server_args]] args = [str(arg) for arg in [context.server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}") print(f"bench: starting server with: {' '.join(args)}")

View file

@ -1,7 +1,8 @@
#pragma once #pragma once
#include "llama.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h"
#ifndef NDEBUG #ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
@ -15,10 +16,10 @@
#define JSON_ASSERT GGML_ASSERT #define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
#include <random>
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <sstream>
#include <random>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
@ -35,32 +36,6 @@ enum error_type {
ERROR_TYPE_NOT_SUPPORTED, // custom error ERROR_TYPE_NOT_SUPPORTED, // custom error
}; };
extern bool server_verbose;
extern bool server_log_json;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
template <typename T> template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) { static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value // Fallback null to default value
@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
try { try {
return body.at(key); return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
std::stringstream ss; LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
LOG_WARNING(ss.str().c_str(), body);
return default_value; return default_value;
} }
} else { } else {
@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul
} }
} }
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = json{
{"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
log.merge_patch({
{"level", level},
{"function", function},
{"line", line},
{"msg", message},
});
if (!extra.empty()) {
log.merge_patch(extra);
}
printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
} else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
if (!extra.empty()) {
log.merge_patch(extra);
}
std::stringstream ss;
ss << buf << " |";
for (const auto & el : log.items())
{
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
ss << " " << el.key() << "=" << value;
}
const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
}
fflush(stdout);
}
// //
// chat template utils // chat template utils
// //
@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content}); chat.push_back({role, content});
} }
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat; return formatted_chat;
} }
@ -243,10 +175,7 @@ static std::string random_string() {
} }
static std::string gen_chatcmplid() { static std::string gen_chatcmplid() {
std::stringstream chatcmplid; return "chatcmpl-" + random_string();
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
} }
// //
@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
return std::string::npos; return std::string::npos;
} }
static bool json_is_array_of_numbers(json data) { static bool json_is_array_of_numbers(const json & data) {
if (data.is_array()) { if (data.is_array()) {
for (const auto & e : data) { for (const auto & e : data) {
if (!e.is_number()) { if (!e.is_number()) {
@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
return out; return out;
} }
static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) { static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str = const std::string str =
std::string(event) + ": " + std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) + data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n"; "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
LOG_VERBOSE("data stream", { LOG_DBG("data stream, to_send: %s", str.c_str());
{ "to_send", str }
});
return sink.write(str.c_str(), str.size()); return sink.write(str.c_str(), str.size());
} }
@ -404,6 +331,9 @@ static json oaicompat_completion_params_parse(
std::string response_type = json_value(response_format, "type", std::string()); std::string response_type = json_value(response_format, "type", std::string());
if (response_type == "json_object") { if (response_type == "json_object") {
llama_params["json_schema"] = json_value(response_format, "schema", json::object()); llama_params["json_schema"] = json_value(response_format, "schema", json::object());
} else if (response_type == "json_schema") {
json json_schema = json_value(response_format, "json_schema", json::object());
llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
} else if (!response_type.empty() && response_type != "text") { } else if (!response_type.empty() && response_type != "text") {
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
} }
@ -425,7 +355,7 @@ static json oaicompat_completion_params_parse(
// Params supported by OAI but unsupported by llama.cpp // Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" }; static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
for (auto & param : unsupported_params) { for (const auto & param : unsupported_params) {
if (body.contains(param)) { if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param); throw std::runtime_error("Unsupported param: " + param);
} }
@ -444,7 +374,7 @@ static json oaicompat_completion_params_parse(
return llama_params; return llama_params;
} }
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0; bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false); bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0); int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@ -481,7 +411,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
{"id", completion_id} {"id", completion_id}
}; };
if (server_verbose) { // extra fields for debugging purposes
if (verbose) {
res["__verbose"] = result; res["__verbose"] = result;
} }
@ -493,7 +424,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
} }
// return value is vector as there is one case where we might need to generate two responses // return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) { static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result}); return std::vector<json>({result});
} }
@ -595,7 +526,7 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array(); json data = json::array();
int i = 0; int i = 0;
for (auto & elem : embeddings) { for (const auto & elem : embeddings) {
data.push_back(json{ data.push_back(json{
{"embedding", json_value(elem, "embedding", json::array())}, {"embedding", json_value(elem, "embedding", json::array())},
{"index", i++}, {"index", i++},

View file

@ -1,16 +1,14 @@
#include "arg.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "log.h"
#include "llama.h" #include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector> #include <vector>
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n"); LOG("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
LOG_TEE("\n"); LOG("\n");
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -23,6 +21,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
gpt_init();
// total length of the sequence including the prompt // total length of the sequence including the prompt
const int n_predict = params.n_predict; const int n_predict = params.n_predict;
@ -69,25 +69,24 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); LOG("\n");
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens // make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) { if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1; return 1;
} }
// print the prompt token-by-token // print the prompt token-by-token
fprintf(stderr, "\n"); LOG("\n");
for (auto id : tokens_list) { for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); LOG("%s", llama_token_to_piece(ctx, id).c_str());
} }
fflush(stderr);
// create a llama_batch with size 512 // create a llama_batch with size 512
// we use this object to submit token data for decoding // we use this object to submit token data for decoding
@ -102,7 +101,7 @@ int main(int argc, char ** argv) {
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) { if (llama_decode(ctx, batch) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG("%s: llama_decode() failed\n", __func__);
return 1; return 1;
} }
@ -116,16 +115,16 @@ int main(int argc, char ** argv) {
while (n_cur <= n_predict) { while (n_cur <= n_predict) {
// sample the next token // sample the next token
{ {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG_TEE("\n"); LOG("\n");
break; break;
} }
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout); fflush(stdout);
// prepare the next batch // prepare the next batch
@ -141,23 +140,23 @@ int main(int argc, char ** argv) {
// evaluate the current batch with the transformer model // evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) { if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1; return 1;
} }
} }
LOG_TEE("\n"); LOG("\n");
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
LOG_TEE("\n"); LOG("\n");
llama_perf_sampler_print(smpl); llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx); llama_perf_context_print(ctx);
fprintf(stderr, "\n"); LOG("\n");
llama_batch_free(batch); llama_batch_free(batch);
llama_sampler_free(smpl); llama_sampler_free(smpl);

View file

@ -11,16 +11,17 @@ source /opt/intel/oneapi/setvars.sh
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=llama-2-7b.Q4_0.gguf MODEL_FILE=models/llama-2-7b.Q4_0.gguf
NGL=33 NGL=33
CONEXT=8192
if [ $# -gt 0 ]; then if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1 GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU" echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only #use signle GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
else else
#use multiple GPUs with same max compute units #use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
fi fi

View file

@ -1,11 +1,13 @@
#include "common.h" #include "common.h"
//#include "log.h" // TODO: start using log.h
#include "llama.h" #include "llama.h"
#include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <iostream> // TODO: remove me
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
@ -13,25 +15,25 @@
#include <shellapi.h> // For CommandLineToArgvW #include <shellapi.h> // For CommandLineToArgvW
#endif #endif
static void print_usage_information(const char * argv0, FILE * stream) { static void print_usage_information(const char * argv0) {
fprintf(stream, "usage: %s [options]\n\n", argv0); printf("usage: %s [options]\n\n", argv0);
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); printf("The tokenize program tokenizes a prompt using a given model,\n");
fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); printf("and prints the resulting tokens to standard output.\n\n");
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); printf("It needs a model file, a prompt, and optionally other flags\n");
fprintf(stream, "to control the behavior of the tokenizer.\n\n"); printf("to control the behavior of the tokenizer.\n\n");
fprintf(stream, " The possible options are:\n"); printf(" The possible options are:\n");
fprintf(stream, "\n"); printf("\n");
fprintf(stream, " -h, --help print this help and exit\n"); printf(" -h, --help print this help and exit\n");
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n"); printf(" --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n"); printf(" --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n"); printf(" --show-count print the total number of tokens.\n");
} }
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
const int argc = argv.size(); const int argc = argv.size();
if (argc <= 1) { if (argc <= 1) {
print_usage_information(argv[0].c_str(), stderr); print_usage_information(argv[0].c_str());
return 1; return 1;
} }
@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
for (; iarg < argc; ++iarg) { for (; iarg < argc; ++iarg) {
std::string arg{argv[iarg]}; std::string arg{argv[iarg]};
if (arg == "-h" || arg == "--help") { if (arg == "-h" || arg == "--help") {
print_usage_information(argv[0].c_str(), stdout); print_usage_information(argv[0].c_str());
return 0; return 0;
} }
else if (arg == "--ids") { else if (arg == "--ids") {
@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
// Start actually doing the tokenizing stuff. // Start actually doing the tokenizing stuff.
////// //////
#ifdef LOG_DISABLE_LOGS
disable_logging = true;
#endif
if (disable_logging) { if (disable_logging) {
llama_log_set(llama_log_callback_null, NULL); llama_log_set(llama_log_callback_null, NULL);
} }

View file

@ -570,10 +570,11 @@ extern "C" {
}; };
enum ggml_log_level { enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2, GGML_LOG_LEVEL_NONE = 0,
GGML_LOG_LEVEL_WARN = 3, GGML_LOG_LEVEL_INFO = 1,
GGML_LOG_LEVEL_INFO = 4, GGML_LOG_LEVEL_WARN = 2,
GGML_LOG_LEVEL_DEBUG = 5 GGML_LOG_LEVEL_ERROR = 3,
GGML_LOG_LEVEL_DEBUG = 4,
}; };
enum ggml_tensor_flag { enum ggml_tensor_flag {

View file

@ -4,6 +4,7 @@
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>

614
ggml/src/ggml-cpu-impl.h Normal file
View file

@ -0,0 +1,614 @@
#pragma once
// GGML CPU internal header
#include "ggml.h"
#include "ggml-impl.h"
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
//#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_MSC_VER)
#define m512bh(p) p
#define m512i(p) p
#else
#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)
#endif
/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#ifdef _MSC_VER
typedef uint16_t ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
typedef __fp16 ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif // _MSC_VER
#if !defined(__aarch64__)
// 32-bit ARM compatibility
// vaddlvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
// vaddvq_f32
// vmaxvq_f32
// vcvtnq_s32_f32
// vzip1_u8
// vzip2_u8
inline static int32_t vaddlvq_s16(int16x8_t v) {
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
return vcombine_s32(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
res[1] = roundf(vgetq_lane_f32(v, 1));
res[2] = roundf(vgetq_lane_f32(v, 2));
res[3] = roundf(vgetq_lane_f32(v, 3));
return res;
}
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly
typedef struct ggml_int16x8x2_t {
int16x8_t val[2];
} ggml_int16x8x2_t;
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
ggml_int16x8x2_t res;
res.val[0] = vld1q_s16(ptr + 0);
res.val[1] = vld1q_s16(ptr + 8);
return res;
}
typedef struct ggml_uint8x16x2_t {
uint8x16_t val[2];
} ggml_uint8x16x2_t;
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
ggml_uint8x16x2_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
return res;
}
typedef struct ggml_uint8x16x4_t {
uint8x16_t val[4];
} ggml_uint8x16x4_t;
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
ggml_uint8x16x4_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
res.val[2] = vld1q_u8(ptr + 32);
res.val[3] = vld1q_u8(ptr + 48);
return res;
}
typedef struct ggml_int8x16x2_t {
int8x16_t val[2];
} ggml_int8x16x2_t;
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
ggml_int8x16x2_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
return res;
}
typedef struct ggml_int8x16x4_t {
int8x16_t val[4];
} ggml_int8x16x4_t;
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
ggml_int8x16x4_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
res.val[2] = vld1q_s8(ptr + 32);
res.val[3] = vld1q_s8(ptr + 48);
return res;
}
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
int8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
uint8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
#else
#define ggml_int16x8x2_t int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t int8x16x2_t
#define ggml_int8x16x4_t int8x16x4_t
#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2 vld1q_u8_x2
#define ggml_vld1q_u8_x4 vld1q_u8_x4
#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8
#define ggml_vqtbl1q_u8 vqtbl1q_u8
#endif // !defined(__aarch64__)
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif // !defined(__ARM_FEATURE_DOTPROD)
#endif // defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#if defined(__loongarch64)
#if defined(__loongarch_asx)
#include <lasxintrin.h>
#endif
#if defined(__loongarch_sx)
#include <lsxintrin.h>
#endif
#endif
#if defined(__loongarch_asx)
typedef union {
int32_t i;
float f;
} ft_union;
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}
static __m256 __lasx_xvreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
}
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
#ifdef __cplusplus
}
#endif

View file

@ -1,15 +1,17 @@
#pragma once #pragma once
#include "ggml.h"
// GGML internal header // GGML internal header
#include "ggml.h"
#include <assert.h> #include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
#include <string.h> // memcpy #include <stdint.h>
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
#undef MIN #undef MIN
#undef MAX #undef MAX
@ -17,96 +19,6 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#if defined(_MSC_VER)
#define m512bh(p) p
#define m512i(p) p
#else
#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)
#endif
/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
#ifdef __cplusplus
extern "C" {
#endif
// static_assert should be a #define, but if it's not, // static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword. // fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop // if C99 - static_assert is noop
@ -121,520 +33,6 @@ extern "C" {
#endif #endif
#endif #endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#ifdef _MSC_VER
typedef uint16_t ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
typedef __fp16 ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif // _MSC_VER
#if !defined(__aarch64__)
// 32-bit ARM compatibility
// vaddlvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
// vaddvq_f32
// vmaxvq_f32
// vcvtnq_s32_f32
// vzip1_u8
// vzip2_u8
inline static int32_t vaddlvq_s16(int16x8_t v) {
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
return vcombine_s32(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
res[1] = roundf(vgetq_lane_f32(v, 1));
res[2] = roundf(vgetq_lane_f32(v, 2));
res[3] = roundf(vgetq_lane_f32(v, 3));
return res;
}
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly
typedef struct ggml_int16x8x2_t {
int16x8_t val[2];
} ggml_int16x8x2_t;
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
ggml_int16x8x2_t res;
res.val[0] = vld1q_s16(ptr + 0);
res.val[1] = vld1q_s16(ptr + 8);
return res;
}
typedef struct ggml_uint8x16x2_t {
uint8x16_t val[2];
} ggml_uint8x16x2_t;
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
ggml_uint8x16x2_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
return res;
}
typedef struct ggml_uint8x16x4_t {
uint8x16_t val[4];
} ggml_uint8x16x4_t;
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
ggml_uint8x16x4_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
res.val[2] = vld1q_u8(ptr + 32);
res.val[3] = vld1q_u8(ptr + 48);
return res;
}
typedef struct ggml_int8x16x2_t {
int8x16_t val[2];
} ggml_int8x16x2_t;
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
ggml_int8x16x2_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
return res;
}
typedef struct ggml_int8x16x4_t {
int8x16_t val[4];
} ggml_int8x16x4_t;
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
ggml_int8x16x4_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
res.val[2] = vld1q_s8(ptr + 32);
res.val[3] = vld1q_s8(ptr + 48);
return res;
}
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
int8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
uint8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
#else
#define ggml_int16x8x2_t int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t int8x16x2_t
#define ggml_int8x16x4_t int8x16x4_t
#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2 vld1q_u8_x2
#define ggml_vld1q_u8_x4 vld1q_u8_x4
#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8
#define ggml_vqtbl1q_u8 vqtbl1q_u8
#endif // !defined(__aarch64__)
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif // !defined(__ARM_FEATURE_DOTPROD)
#endif // defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#if defined(__loongarch64)
#if defined(__loongarch_asx)
#include <lasxintrin.h>
#endif
#if defined(__loongarch_sx)
#include <lsxintrin.h>
#endif
#endif
#if defined(__loongarch_asx)
typedef union {
int32_t i;
float f;
} ft_union;
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}
static __m256 __lasx_xvreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
}
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
// bitset // bitset
typedef uint32_t ggml_bitset_t; typedef uint32_t ggml_bitset_t;
@ -761,6 +159,12 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
// computation graph // computation graph
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
struct ggml_cgraph { struct ggml_cgraph {
int size; int size;
int n_nodes; int n_nodes;

View file

@ -13,13 +13,16 @@
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#ifdef GGML_METAL_NDEBUG #ifdef GGML_METAL_NDEBUG
#define GGML_METAL_LOG(...)
#define GGML_METAL_LOG_INFO(...) #define GGML_METAL_LOG_INFO(...)
#define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...) #define GGML_METAL_LOG_ERROR(...)
#else #else
#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#endif #endif
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
@ -3183,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
#ifndef GGML_METAL_NDEBUG #ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) { if (@available(macOS 10.12, iOS 16.0, *)) {
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)", GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
__func__, __func__,
size_aligned / 1024.0 / 1024.0, size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0,
@ -3191,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
} else {
GGML_METAL_LOG_INFO("\n");
} }
} else { } else {
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
@ -3224,15 +3225,19 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
ctx->n_buffers = 1; ctx->n_buffers = 1;
if (ctx->all_data != NULL) { if (ctx->all_data != NULL) {
ctx->buffers[0].data = ctx->all_data; ctx->buffers[0].data = ctx->all_data;
ctx->buffers[0].size = size; ctx->buffers[0].size = size;
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data ctx->buffers[0].metal = nil;
length:size_aligned
options:MTLResourceStorageModeShared if (size_aligned > 0) {
deallocator:nil]; ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
}
} }
if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
free(ctx); free(ctx);
ggml_backend_metal_free_device(); ggml_backend_metal_free_device();
@ -3309,14 +3314,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
// the buffer fits into the max buffer size allowed by the device // the buffer fits into the max buffer size allowed by the device
if (size_aligned <= device.maxBufferLength) { if (size_aligned <= device.maxBufferLength) {
ctx->buffers[ctx->n_buffers].data = data; ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size; ctx->buffers[ctx->n_buffers].size = size;
ctx->buffers[ctx->n_buffers].metal = nil;
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (size_aligned > 0) {
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
return false; return false;
}
} }
ggml_backend_metal_log_allocated_size(device, size_aligned); ggml_backend_metal_log_allocated_size(device, size_aligned);
@ -3332,14 +3340,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
for (size_t i = 0; i < size; i += size_step) { for (size_t i = 0; i < size; i += size_step) {
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
ctx->buffers[ctx->n_buffers].size = size_step_aligned; ctx->buffers[ctx->n_buffers].size = size_step_aligned;
ctx->buffers[ctx->n_buffers].metal = nil;
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (size_step_aligned > 0) {
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
return false; return false;
}
} }
ggml_backend_metal_log_allocated_size(device, size_step_aligned); ggml_backend_metal_log_allocated_size(device, size_step_aligned);

View file

@ -3,6 +3,7 @@
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include <math.h> #include <math.h>
@ -231,6 +232,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
return _mm_packus_epi16( bytes1, bytes2); return _mm_packus_epi16( bytes1, bytes2);
} }
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
const __m128i ax = _mm_sign_epi8(x, x);
const __m128i sy = _mm_sign_epi8(y, x);
return _mm_maddubs_epi16(ax, sy);
}
#endif #endif
#elif defined(__SSSE3__) #elif defined(__SSSE3__)
// horizontally add 4x4 floats // horizontally add 4x4 floats
@ -4207,37 +4214,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_8(acc); sumf = hsum_float_8(acc);
#elif defined(__AVX__) #elif defined(__AVX__)
// Initialize accumulator with zeros const __m128i mone = _mm_set1_epi16(1);
__m256 acc = _mm256_setzero_ps();
// Main loop __m256 accum1 = _mm256_setzero_ps();
for (; ib < nb; ++ib) { __m256 accum2 = _mm256_setzero_ps();
// Compute combined scale for the block for (; ib + 1 < nb; ib += 2) {
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
const __m128i lowMask = _mm_set1_epi8(0xF); const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
const __m128i off = _mm_set1_epi8(8); const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs); const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
__m128i bx_0 = _mm_and_si128(lowMask, tmp); const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
bx_0 = _mm_sub_epi8(bx_0, off); const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
bx_0 = _mm_sub_epi8(bx_0, off); accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
// Convert int32_t to float _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
// Apply the scale, and accumulate
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
} }
sumf = hsum_float_8(acc); sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
#elif defined(__SSSE3__) #elif defined(__SSSE3__)
// set constants // set constants
const __m128i lowMask = _mm_set1_epi8(0xF); const __m128i lowMask = _mm_set1_epi8(0xF);
@ -11820,15 +11827,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
#endif #endif
} }
#if defined(__AVX__)
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
const __m128i ax = _mm_sign_epi8(x, x);
const __m128i sy = _mm_sign_epi8(y, x);
return _mm_maddubs_epi16(ax, sy);
}
#endif
#if defined(__AVX2__) #if defined(__AVX2__)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i ax = _mm256_sign_epi8(x, x); const __m256i ax = _mm256_sign_epi8(x, x);

View file

@ -2,6 +2,7 @@
#define _USE_MATH_DEFINES // For M_PI on MSVC #define _USE_MATH_DEFINES // For M_PI on MSVC
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-aarch64.h" #include "ggml-aarch64.h"
@ -2020,10 +2021,11 @@ struct ggml_threadpool {
// these are atomic as an annotation for thread-sanitizer // these are atomic as an annotation for thread-sanitizer
atomic_bool stop; // Used for stopping the threadpool altogether atomic_bool stop; // Used for stopping the threadpool altogether
atomic_bool pause; // Used for pausing the threadpool or individual threads atomic_bool pause; // Used for pausing the threadpool or individual threads
atomic_bool abort; // Used for aborting processing of a graph
struct ggml_compute_state * workers; // per thread state struct ggml_compute_state * workers; // per thread state
int n_threads_max; // number of threads in the pool int n_threads_max; // number of threads in the pool
int n_threads_cur; // number of threads used in the current graph atomic_int n_threads_cur; // number of threads used in the current graph
int32_t prio; // Scheduling priority int32_t prio; // Scheduling priority
uint32_t poll; // Polling level (0 - no polling) uint32_t poll; // Polling level (0 - no polling)
@ -3194,41 +3196,36 @@ inline static void ggml_critical_section_start(void) {
} }
} }
static void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
if (n_threads == 1) {
return;
}
#ifdef GGML_USE_OPENMP #ifdef GGML_USE_OPENMP
static void ggml_barrier(struct ggml_threadpool * threadpool) {
if (threadpool->n_threads_cur == 1) {
return;
}
#pragma omp barrier #pragma omp barrier
}
#else #else
static void ggml_barrier(struct ggml_threadpool * threadpool) { int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
if (threadpool->n_threads_cur == 1) {
return;
}
atomic_int * n_barrier = &threadpool->n_barrier; // enter barrier (full seq-cst fence)
atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
int n_threads = threadpool->n_threads_cur; int last = 0;
int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed); if (n_barrier == (n_threads - 1)) {
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
// last thread // last thread
atomic_store(n_barrier, 0); atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed); last = 1;
} else { } else {
// wait for other threads // wait for other threads
while (true) { while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
return;
}
ggml_thread_cpu_relax(); ggml_thread_cpu_relax();
} }
} }
}
// exit barrier (full seq-cst fence)
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
#endif #endif
}
// TODO: make this somehow automatically executed // TODO: make this somehow automatically executed
// some sort of "sentry" mechanism // some sort of "sentry" mechanism
@ -19991,34 +19988,33 @@ struct ggml_cplan ggml_graph_plan(
static thread_ret_t ggml_graph_compute_thread(void * data) { static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_threadpool * tp = state->threadpool;
const struct ggml_cgraph * cgraph = state->threadpool->cgraph; const struct ggml_cgraph * cgraph = tp->cgraph;
const struct ggml_cplan * cplan = state->threadpool->cplan; const struct ggml_cplan * cplan = tp->cplan;
set_numa_thread_affinity(state->ith); set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = { struct ggml_compute_params params = {
/*.ith =*/ state->ith, /*.ith =*/ state->ith,
/*.nth =*/ state->threadpool->n_threads_cur, /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
/*.wsize =*/ cplan->work_size, /*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data, /*.wdata =*/ cplan->work_data,
/*.threadpool=*/ state->threadpool, /*.threadpool=*/ tp,
}; };
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_tensor * node = cgraph->nodes[node_n];
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { if (state->ith == 0 && cplan->abort_callback &&
state->threadpool->ec = GGML_STATUS_ABORTED; cplan->abort_callback(cplan->abort_callback_data)) {
tp->abort = true;
tp->ec = GGML_STATUS_ABORTED;
} }
ggml_barrier(state->threadpool); ggml_barrier(state->threadpool);
if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
break;
}
} }
return 0; return 0;
@ -20026,7 +20022,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
#ifndef GGML_USE_OPENMP #ifndef GGML_USE_OPENMP
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { // check if thread is active
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
return (state->ith < n_threads);
}
// check if thread is ready to proceed (exit from polling or sleeping)
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool; struct ggml_threadpool * threadpool = state->threadpool;
if (state->pending || threadpool->stop || threadpool->pause) { return true; } if (state->pending || threadpool->stop || threadpool->pause) { return true; }
@ -20034,21 +20038,34 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
// check for new graph/work // check for new graph/work
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
if (new_graph != state->last_graph) { if (new_graph != state->last_graph) {
state->pending = (state->ith < threadpool->n_threads_cur); state->pending = ggml_graph_compute_thread_active(state);
state->last_graph = new_graph; state->last_graph = new_graph;
} }
return state->pending; return state->pending;
} }
// sync thread state after polling
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;
// this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
// so instead we just use a dummy read-modify-write
atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
}
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool; struct ggml_threadpool * threadpool = state->threadpool;
// Skip polling for unused threads
if (!ggml_graph_compute_thread_active(state)) {
return state->pending;
}
// This seems to make 0 ... 100 a decent range for polling level across modern processors. // This seems to make 0 ... 100 a decent range for polling level across modern processors.
// Perhaps, we can adjust it dynamically based on load and things. // Perhaps, we can adjust it dynamically based on load and things.
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) { for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
// No new work. Keep polling. // No new work. Keep polling.
ggml_thread_cpu_relax(); ggml_thread_cpu_relax();
} }
@ -20060,13 +20077,14 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
struct ggml_threadpool * threadpool = state->threadpool; struct ggml_threadpool * threadpool = state->threadpool;
if (ggml_graph_compute_poll_for_work(state)) { if (ggml_graph_compute_poll_for_work(state)) {
ggml_graph_compute_thread_sync(state);
return state->pending; return state->pending;
} }
ggml_mutex_lock_shared(&threadpool->mutex); ggml_mutex_lock_shared(&threadpool->mutex);
while (!ggml_graph_compute_ready(state)) { while (!ggml_graph_compute_thread_ready(state)) {
// No new work. Wait for the signal. // No new work. Wait for the signal.
GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith); GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
ggml_cond_wait(&threadpool->cond, &threadpool->mutex); ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
} }
ggml_mutex_unlock_shared(&threadpool->mutex); ggml_mutex_unlock_shared(&threadpool->mutex);
@ -20113,13 +20131,20 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
} }
// Start processing new graph // Start processing new graph
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool) static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
{ {
// always take the mutex here because the worker threads are doing hybrid poll/wait // Always take the mutex here because the worker threads are doing hybrid poll/wait
ggml_mutex_lock(&threadpool->mutex); ggml_mutex_lock(&threadpool->mutex);
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
// Update the number of active threads
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
// Indicate the graph is ready to be processed
// We need the full seq-cst fence here because of the polling threads (used in thread_sync)
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
if (threadpool->pause) { if (threadpool->pause) {
// Update main thread prio and affinity to match the threadpool settings // Update main thread prio and affinity to match the threadpool settings
@ -20178,6 +20203,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
threadpool->current_chunk = 0; threadpool->current_chunk = 0;
threadpool->stop = false; threadpool->stop = false;
threadpool->pause = tpp->paused; threadpool->pause = tpp->paused;
threadpool->abort = false;
threadpool->workers = NULL; threadpool->workers = NULL;
threadpool->n_threads_max = tpp->n_threads; threadpool->n_threads_max = tpp->n_threads;
threadpool->n_threads_cur = tpp->n_threads; threadpool->n_threads_cur = tpp->n_threads;
@ -20253,15 +20279,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
// No worker threads should be accessing the parameters below at this stage // No worker threads should be accessing the parameters below at this stage
threadpool->cgraph = cgraph; threadpool->cgraph = cgraph;
threadpool->cplan = cplan; threadpool->cplan = cplan;
threadpool->n_threads_cur = n_threads;
threadpool->current_chunk = 0; threadpool->current_chunk = 0;
threadpool->abort = false;
threadpool->ec = GGML_STATUS_SUCCESS; threadpool->ec = GGML_STATUS_SUCCESS;
} }
if (n_threads > threadpool->n_threads_max) {
GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
}
#ifdef GGML_USE_OPENMP #ifdef GGML_USE_OPENMP
if (n_threads > 1) { if (n_threads > 1) {
#pragma omp parallel num_threads(n_threads) #pragma omp parallel num_threads(n_threads)
@ -20270,17 +20292,23 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
{ {
// update the number of threads from the actual number of threads that we got from OpenMP // update the number of threads from the actual number of threads that we got from OpenMP
n_threads = omp_get_num_threads(); n_threads = omp_get_num_threads();
threadpool->n_threads_cur = n_threads; atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
} }
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
} }
} else { } else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
ggml_graph_compute_thread(&threadpool->workers[0]); ggml_graph_compute_thread(&threadpool->workers[0]);
} }
#else #else
if (n_threads > threadpool->n_threads_max) {
GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
n_threads = threadpool->n_threads_max;
}
// Kick all threads to start the new graph // Kick all threads to start the new graph
ggml_graph_compute_kickoff(threadpool); ggml_graph_compute_kickoff(threadpool, n_threads);
// This is a work thread too // This is a work thread too
ggml_graph_compute_thread(&threadpool->workers[0]); ggml_graph_compute_thread(&threadpool->workers[0]);

View file

@ -50,6 +50,7 @@
#include "sgemm.h" #include "sgemm.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#ifdef _MSC_VER #ifdef _MSC_VER
@ -235,6 +236,14 @@ template <> inline __m512 load(const ggml_fp16_t *p) {
} }
#endif // __AVX512F__ #endif // __AVX512F__
////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION // FLOATING POINT MATRIX MULTIPLICATION
@ -933,6 +942,20 @@ class tinyBLAS_Q0_AVX {
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
} }
inline __m256i load(const block_iq4_nl *b) {
return MM256_SET_M128I(load1(b), load0(b));
}
inline __m128i load0(const block_iq4_nl *b) {
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
}
inline __m128i load1(const block_iq4_nl *b) {
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
}
inline __m256 updot(__m256i u, __m256i s) { inline __m256 updot(__m256i u, __m256i s) {
__m256i res; __m256i res;
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@ -1159,6 +1182,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif #endif
} }
case GGML_TYPE_IQ4_NL: {
if (Btype != GGML_TYPE_Q8_0)
return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#else
return false;
#endif
}
default: default:
return false; return false;
} }

View file

@ -97,6 +97,8 @@ class Keys:
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
RESIDUAL_SCALE = "{arch}.residual_scale"
EMBEDDING_SCALE = "{arch}.embedding_scale"
class Attention: class Attention:
HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT = "{arch}.attention.head_count"
@ -112,6 +114,7 @@ class Keys:
KV_LORA_RANK = "{arch}.attention.kv_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
SLIDING_WINDOW = "{arch}.attention.sliding_window" SLIDING_WINDOW = "{arch}.attention.sliding_window"
SCALE = "{arch}.attention.scale"
class Rope: class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -210,6 +213,7 @@ class MODEL_ARCH(IntEnum):
ORION = auto() ORION = auto()
INTERNLM2 = auto() INTERNLM2 = auto()
MINICPM = auto() MINICPM = auto()
MINICPM3 = auto()
GEMMA = auto() GEMMA = auto()
GEMMA2 = auto() GEMMA2 = auto()
STARCODER2 = auto() STARCODER2 = auto()
@ -219,6 +223,7 @@ class MODEL_ARCH(IntEnum):
COMMAND_R = auto() COMMAND_R = auto()
DBRX = auto() DBRX = auto()
OLMO = auto() OLMO = auto()
OLMOE = auto()
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
DEEPSEEK2 = auto() DEEPSEEK2 = auto()
@ -229,6 +234,7 @@ class MODEL_ARCH(IntEnum):
JAIS = auto() JAIS = auto()
NEMOTRON = auto() NEMOTRON = auto()
EXAONE = auto() EXAONE = auto()
GRANITE = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -364,6 +370,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.ORION: "orion", MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
@ -373,6 +380,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2: "deepseek2",
@ -383,6 +391,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.JAIS: "jais", MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -867,6 +876,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_UP_EXP,
], ],
MODEL_ARCH.MINICPM3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q_A,
MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GEMMA: [ MODEL_ARCH.GEMMA: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1008,6 +1034,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.OLMOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
],
MODEL_ARCH.OPENELM: [ MODEL_ARCH.OPENELM: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1186,6 +1229,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.GRANITE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO # TODO
} }

View file

@ -679,6 +679,12 @@ class GGUFWriter:
def add_time_decay_extra_dim(self, dim: int) -> None: def add_time_decay_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim) self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
def add_residual_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
def add_embedding_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
def add_wkv_head_size(self, size: int) -> None: def add_wkv_head_size(self, size: int) -> None:
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
@ -703,6 +709,9 @@ class GGUFWriter:
def add_sliding_window(self, value: int) -> None: def add_sliding_window(self, value: int) -> None:
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value) self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
def add_attention_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None: def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

View file

@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron "model.embed_tokens", # llama-hf nemotron olmoe
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -54,7 +54,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
@ -66,7 +66,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 "model.norm", # llama-hf baichuan internlm2 olmoe
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
@ -98,7 +98,7 @@ class TensorNameMap:
"transformer.h.{bid}.input_layernorm", # falcon7b "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom "h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf nemotron "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
@ -142,7 +142,7 @@ class TensorNameMap:
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
@ -154,7 +154,7 @@ class TensorNameMap:
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
@ -167,7 +167,7 @@ class TensorNameMap:
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
@ -185,7 +185,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
@ -229,7 +229,7 @@ class TensorNameMap:
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
"h.{bid}.post_attention_layernorm", # bloom "h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi "model.layers.{bid}.ln2", # yi
@ -253,7 +253,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"model.layers.{bid}.mlp.gate", # qwen2moe "model.layers.{bid}.mlp.gate", # qwen2moe olmoe
"transformer.decoder_layer.{bid}.router", # Grok "transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx "transformer.blocks.{bid}.ffn.router.layer", # dbrx
), ),
@ -295,7 +295,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged) "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
), ),
MODEL_TENSOR.FFN_UP_SHEXP: ( MODEL_TENSOR.FFN_UP_SHEXP: (
@ -327,7 +327,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged) "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
), ),
MODEL_TENSOR.FFN_GATE_SHEXP: ( MODEL_TENSOR.FFN_GATE_SHEXP: (
@ -367,7 +367,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged) "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
), ),
MODEL_TENSOR.FFN_DOWN_SHEXP: ( MODEL_TENSOR.FFN_DOWN_SHEXP: (
@ -378,7 +378,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere "model.layers.{bid}.self_attn.q_norm", # cohere olmoe
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm "transformer.layers.{bid}.attn.q_norm", # openelm
@ -387,7 +387,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere "model.layers.{bid}.self_attn.k_norm", # cohere olmoe
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm "transformer.layers.{bid}.attn.k_norm", # openelm

View file

@ -441,6 +441,7 @@ extern "C" {
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_n_head (const struct llama_model * model);
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

View file

@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

View file

@ -236,9 +236,10 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_n_vocab(llama_get_model(ctx));
// TODO: do not allocate each time // TODO: do not allocate each time
std::vector<llama_token_data> cur(n_vocab); std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
} }
llama_token_data_array cur_p = { llama_token_data_array cur_p = {

View file

@ -203,6 +203,7 @@ enum llm_arch {
LLM_ARCH_ORION, LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2, LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM, LLM_ARCH_MINICPM,
LLM_ARCH_MINICPM3,
LLM_ARCH_GEMMA, LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2, LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2, LLM_ARCH_STARCODER2,
@ -211,6 +212,7 @@ enum llm_arch {
LLM_ARCH_COMMAND_R, LLM_ARCH_COMMAND_R,
LLM_ARCH_DBRX, LLM_ARCH_DBRX,
LLM_ARCH_OLMO, LLM_ARCH_OLMO,
LLM_ARCH_OLMOE,
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2,
@ -222,6 +224,7 @@ enum llm_arch {
LLM_ARCH_NEMOTRON, LLM_ARCH_NEMOTRON,
LLM_ARCH_EXAONE, LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6, LLM_ARCH_RWKV6,
LLM_ARCH_GRANITE,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@ -251,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ORION, "orion" }, { LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_MINICPM3, "minicpm3" },
{ LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_STARCODER2, "starcoder2" },
@ -259,6 +263,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" },
@ -270,6 +275,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@ -309,6 +315,8 @@ enum llm_kv {
LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_RESCALE_EVERY_N_LAYERS,
LLM_KV_TIME_MIX_EXTRA_DIM, LLM_KV_TIME_MIX_EXTRA_DIM,
LLM_KV_TIME_DECAY_EXTRA_DIM, LLM_KV_TIME_DECAY_EXTRA_DIM,
LLM_KV_RESIDUAL_SCALE,
LLM_KV_EMBEDDING_SCALE,
LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV, LLM_KV_ATTENTION_HEAD_COUNT_KV,
@ -323,6 +331,7 @@ enum llm_kv {
LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE,
@ -413,6 +422,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
{ LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@ -427,6 +438,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@ -1044,6 +1056,29 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
}, },
}, },
{
LLM_ARCH_MINICPM3,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{ {
LLM_ARCH_GEMMA, LLM_ARCH_GEMMA,
{ {
@ -1178,6 +1213,26 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_OLMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{ {
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
{ {
@ -1417,6 +1472,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
}, },
}, },
{
LLM_ARCH_GRANITE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@ -2266,6 +2337,7 @@ enum e_model {
MODEL_MEDIUM, MODEL_MEDIUM,
MODEL_LARGE, MODEL_LARGE,
MODEL_XL, MODEL_XL,
MODEL_A1_7B,
MODEL_A2_7B, MODEL_A2_7B,
MODEL_8x7B, MODEL_8x7B,
MODEL_8x22B, MODEL_8x22B,
@ -2338,6 +2410,11 @@ struct llama_hparams {
float f_max_alibi_bias = 0.0f; float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f; float f_logit_scale = 0.0f;
// Additional scale factors (Granite)
float f_residual_scale = 0.0f;
float f_embedding_scale = 0.0f;
float f_attention_scale = 0.0f;
bool causal_attn = true; bool causal_attn = true;
bool use_alibi = false; bool use_alibi = false;
bool attn_soft_cap = false; bool attn_soft_cap = false;
@ -2400,6 +2477,9 @@ struct llama_hparams {
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
return false; return false;
} }
@ -5248,6 +5328,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_MEDIUM: return "0.4B"; case MODEL_MEDIUM: return "0.4B";
case MODEL_LARGE: return "0.8B"; case MODEL_LARGE: return "0.8B";
case MODEL_XL: return "1.5B"; case MODEL_XL: return "1.5B";
case MODEL_A1_7B: return "A1.7B";
case MODEL_A2_7B: return "A2.7B"; case MODEL_A2_7B: return "A2.7B";
case MODEL_8x7B: return "8x7B"; case MODEL_8x7B: return "8x7B";
case MODEL_8x22B: return "8x22B"; case MODEL_8x22B: return "8x22B";
@ -5422,6 +5503,17 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_MINICPM3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
switch (hparams.n_layer) {
case 62: model.type = e_model::MODEL_4B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_GROK: case LLM_ARCH_GROK:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5787,6 +5879,14 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_OLMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 16: model.type = e_model::MODEL_A1_7B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_OPENELM: case LLM_ARCH_OPENELM:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5983,6 +6083,20 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_GRANITE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
switch (hparams.n_layer) {
case 40: model.type = e_model::MODEL_3B; break;
// Add additional layer/vocab/etc checks here for other model sizes
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0; default: (void)0;
} }
@ -6025,8 +6139,15 @@ static void llm_load_vocab(
vocab.special_mask_id = -1; vocab.special_mask_id = -1;
vocab.linefeed_id = -1; vocab.linefeed_id = -1;
// read vocab size from metadata
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
vocab.n_vocab = 0;
LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
}
return; return;
} else if (tokenizer_model == "llama") { }
if (tokenizer_model == "llama") {
vocab.type = LLAMA_VOCAB_TYPE_SPM; vocab.type = LLAMA_VOCAB_TYPE_SPM;
// default special tokens // default special tokens
@ -6699,6 +6820,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
} }
if (model.arch == LLM_ARCH_GRANITE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
}
} }
// Returns false if cancelled by progress_callback // Returns false if cancelled by progress_callback
@ -6876,6 +7003,7 @@ static bool llm_load_tensors(
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT: case LLM_ARCH_REFACT:
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@ -6956,6 +7084,54 @@ static bool llm_load_tensors(
} }
} }
} break; } break;
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
} break;
case LLM_ARCH_GROK: case LLM_ARCH_GROK:
{ {
if (n_expert == 0) { if (n_expert == 0) {
@ -7993,6 +8169,44 @@ static bool llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
} }
} break; } break;
case LLM_ARCH_OLMOE:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd});
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
GGML_ASSERT(n_expert > 0);
GGML_ASSERT(n_expert_used > 0);
// MoE branch
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
}
} break;
case LLM_ARCH_OPENELM: case LLM_ARCH_OPENELM:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@ -8773,6 +8987,11 @@ static struct ggml_tensor * llm_build_inp_embd(
ggml_set_input(lctx.inp_embd); ggml_set_input(lctx.inp_embd);
} }
// For Granite architecture
if (hparams.f_embedding_scale != 0.0f) {
inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
}
cb(inpL, "inp_embd", -1); cb(inpL, "inp_embd", -1);
return inpL; return inpL;
@ -9476,7 +9695,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
struct ggml_tensor * cur, struct ggml_tensor * cur,
struct ggml_tensor * x_prev, struct ggml_tensor * x_prev,
struct ggml_tensor ** wkv_state) { struct ggml_tensor ** wkv_state) {
size_t n_embed = cur->ne[0]; size_t n_embd = cur->ne[0];
size_t n_seq_tokens = cur->ne[1]; size_t n_seq_tokens = cur->ne[1];
size_t n_seqs = cur->ne[2]; size_t n_seqs = cur->ne[2];
@ -9487,8 +9706,8 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens); sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
@ -9513,11 +9732,11 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
xxx xxx
); );
struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float)); struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float)); struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
struct ggml_tensor * xw = ggml_add( struct ggml_tensor * xw = ggml_add(
ctx, ctx,
@ -9586,7 +9805,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
) )
); );
w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)); w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
@ -9595,21 +9814,21 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
r = ggml_transpose(ctx, r); r = ggml_transpose(ctx, r);
struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
*wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
// group norm with head_count groups // group norm with head_count groups
cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens); cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
cur = ggml_norm(ctx, cur, 64e-5f); cur = ggml_norm(ctx, cur, 64e-5f);
// Convert back to regular vectors. // Convert back to regular vectors.
cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
cur = ggml_mul(ctx, cur, g); cur = ggml_mul(ctx, cur, g);
cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs); return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
} }
static struct ggml_tensor * llm_build_rwkv6_channel_mix( static struct ggml_tensor * llm_build_rwkv6_channel_mix(
@ -10051,6 +10270,7 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -10107,7 +10327,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf, cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10118,6 +10338,11 @@ struct llm_build_context {
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
} }
// For Granite architecture
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
@ -10154,6 +10379,11 @@ struct llm_build_context {
cb(cur, "ffn_moe_out", il); cb(cur, "ffn_moe_out", il);
} }
// For Granite architecture
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10173,6 +10403,12 @@ struct llm_build_context {
// lm_head // lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
// For Granite architecture
if (hparams.f_logit_scale) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
}
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
@ -12920,6 +13156,215 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_minicpm3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
struct ggml_tensor * rope_factors = build_rope_factors(il);
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self_attention
{
struct ggml_tensor * q = NULL;
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);
q = llm_build_norm(ctx0, q, hparams,
model.layers[il].attn_q_a_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(q, "q", il);
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
// split into {kv_lora_rank, n_tokens}
struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
kv_pe_compresseed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
// and {n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compresseed->nb[1],
kv_pe_compresseed->nb[1],
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
cb(k_pe, "k_pe", il);
kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
model.layers[il].attn_kv_a_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(kv_compressed, "kv_compressed", il);
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
cb(kv, "kv", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
v_states = ggml_cont(ctx0, v_states);
cb(v_states, "v_states", il);
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
0);
cb(v_states, "v_states", il);
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
q_pe = ggml_rope_ext(
ctx0, q_pe, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(q_pe, "q_pe", il);
// shared RoPE key
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
k_pe = ggml_rope_ext(
ctx0, k_pe, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(k_pe, "k_pe", il);
struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
cb(q_states, "q_states", il);
struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
cb(k_states, "k_states", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, NULL,
k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth/sqrtf(float(n_layer));
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", il);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// scale the hidden states for residual connection
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled_ffn", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head scaling
const float scale_lmhead = float(n_embd_base)/float(n_embd);
cur = ggml_scale(ctx0, cur, scale_lmhead);
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_gemma() { struct ggml_cgraph * build_gemma() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@ -13616,6 +14061,134 @@ struct llm_build_context {
return gf; return gf;
} }
// based on the build_qwen2moe() function, changes:
// * removed shared experts
// * removed bias
// * added q, k norm
struct ggml_cgraph * build_olmoe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self_attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur_normed", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur_rope", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// MoE branch
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_moe_ffn(ctx0, lctx, cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
cb, il);
cb(cur, "ffn_moe_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_openelm() { struct ggml_cgraph * build_openelm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@ -15375,6 +15948,7 @@ static struct ggml_cgraph * llama_build_graph(
switch (model.arch) { switch (model.arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_GRANITE:
{ {
result = llm.build_llama(); result = llm.build_llama();
} break; } break;
@ -15460,6 +16034,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_minicpm(); result = llm.build_minicpm();
} break; } break;
case LLM_ARCH_MINICPM3:
{
result = llm.build_minicpm3();
} break;
case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA:
{ {
result = llm.build_gemma(); result = llm.build_gemma();
@ -15492,6 +16070,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_olmo(); result = llm.build_olmo();
} break; } break;
case LLM_ARCH_OLMOE:
{
result = llm.build_olmoe();
} break;
case LLM_ARCH_OPENELM: case LLM_ARCH_OPENELM:
{ {
result = llm.build_openelm(); result = llm.build_openelm();
@ -16155,7 +16737,7 @@ static int llama_decode_internal(
const uint32_t n_tokens_all = batch_all.n_tokens; const uint32_t n_tokens_all = batch_all.n_tokens;
if (n_tokens_all == 0) { if (n_tokens_all == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1; return -1;
} }
@ -16168,7 +16750,7 @@ static int llama_decode_internal(
if (batch_all.token) { if (batch_all.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) { for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) { if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
return -1; return -1;
} }
} }
@ -16456,7 +17038,7 @@ static int llama_encode_internal(
const uint32_t n_tokens = batch.n_tokens; const uint32_t n_tokens = batch.n_tokens;
if (n_tokens == 0) { if (n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1; return -1;
} }
@ -16469,7 +17051,7 @@ static int llama_encode_internal(
if (batch.token) { if (batch.token) {
for (uint32_t i = 0; i < n_tokens; ++i) { for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) { if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1; return -1;
} }
} }
@ -18137,9 +18719,9 @@ struct llama_model * llama_load_model_from_file(
unsigned percentage = (unsigned) (100 * progress); unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) { while (percentage > *cur_percentage_p) {
*cur_percentage_p = percentage; *cur_percentage_p = percentage;
LLAMA_LOG_INFO("."); LLAMA_LOG(".");
if (percentage >= 100) { if (percentage >= 100) {
LLAMA_LOG_INFO("\n"); LLAMA_LOG("\n");
} }
} }
return true; return true;
@ -18611,6 +19193,10 @@ int32_t llama_n_layer(const struct llama_model * model) {
return model->hparams.n_layer; return model->hparams.n_layer;
} }
int32_t llama_n_head(const struct llama_model * model) {
return model->hparams.n_head();
}
const struct llama_model * llama_get_model(const struct llama_context * ctx) { const struct llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model; return &ctx->model;
} }
@ -18649,6 +19235,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_ARCTIC: case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM: case LLM_ARCH_CHATGLM:
case LLM_ARCH_GRANITE:
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2 // the pairs of head values are offset by n_rot/2
@ -18662,6 +19249,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_OLMOE:
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3: case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA:
@ -18672,6 +19260,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_CODESHELL: case LLM_ARCH_CODESHELL:
case LLM_ARCH_NEMOTRON: case LLM_ARCH_NEMOTRON:
case LLM_ARCH_EXAONE: case LLM_ARCH_EXAONE:
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX; return LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here // all model arches should be listed explicitly here
@ -20854,8 +21443,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
if (len < 128) { if (len < 128) {
g_state.log_callback(level, buffer, g_state.log_callback_user_data); g_state.log_callback(level, buffer, g_state.log_callback_user_data);
} else { } else {
char* buffer2 = new char[len+1]; char * buffer2 = new char[len + 1];
vsnprintf(buffer2, len+1, format, args_copy); vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0; buffer2[len] = 0;
g_state.log_callback(level, buffer2, g_state.log_callback_user_data); g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
delete[] buffer2; delete[] buffer2;

View file

@ -5,6 +5,7 @@
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"
#include <algorithm>
#include <cassert> #include <cassert>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>

View file

@ -85,7 +85,7 @@ int main(void) {
argv = {"binary_name", "--verbose"}; argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.verbosity == 1); assert(params.verbosity > 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));

93
tests/test-barrier.cpp Normal file
View file

@ -0,0 +1,93 @@
#include "ggml.h"
#include "ggml-backend.h"
#include <chrono>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>
#define MAX_NARGS 2
int main(int argc, char *argv[]) {
int n_threads = 4;
int n_rounds = 100;
if (argc > 1) {
n_threads = std::atoi(argv[1]);
}
if (argc > 2) {
n_rounds = std::atoi(argv[2]);
}
struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
// Create graph
struct ggml_cgraph * gf = ggml_new_graph(ctx);
// Lots of small, parallel ops where barriers in between will dominate
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
for (int i = 0; i < 1000; i++) {
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
out = ggml_mul_mat(ctx, a, out);
struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
out = ggml_mul_mat(ctx, d, out);
}
ggml_build_forward_expand(gf, out);
int n_nodes = ggml_graph_n_nodes(gf);
// Create threadpool
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
exit(1);
}
// Create compute plan
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size);
cplan.work_data = work_data.data();
std::cerr << "graph-compute with"
<< "\n n_threads: " << n_threads
<< "\n n_nodes: " << n_nodes
<< "\n n_rounds: " << n_rounds
<< "\n";
// ggml_graph_print(gf);
// Warmup
ggml_graph_compute(gf, &cplan);
auto t0 = std::chrono::high_resolution_clock::now();
for (int i=0; i < n_rounds; i++) {
ggml_graph_compute(gf, &cplan);
}
auto t1 = std::chrono::high_resolution_clock::now();
auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
std::cerr << "graph-compute took " << usec << " usec "
<< "\n " << (float) usec / n_rounds << " usec per-iter"
<< "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
<< "\n";
ggml_threadpool_free(threadpool);
ggml_free(ctx);
return 0;
}

39
tests/test-log.cpp Normal file
View file

@ -0,0 +1,39 @@
#include "log.h"
#include <cstdlib>
#include <thread>
int main() {
const int n_thread = 8;
std::thread threads[n_thread];
for (int i = 0; i < n_thread; i++) {
threads[i] = std::thread([i]() {
const int n_msg = 1000;
for (int j = 0; j < n_msg; j++) {
const int log_type = std::rand() % 4;
switch (log_type) {
case 0: LOG_INF("Thread %d: %d\n", i, j); break;
case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
default:
break;
}
if (rand () % 10 < 5) {
gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
gpt_log_set_prefix (gpt_log_main(), rand() % 2);
}
}
});
}
for (int i = 0; i < n_thread; i++) {
threads[i].join();
}
return 0;
}