Mirror of https://github.com/LostRuins/koboldcpp.git — synced 2025-09-10 17:14:36 +00:00

Commit b63158005f — All samplers moved to kcpp side

54 changed files with 3765 additions and 2577 deletions
.gitignore (vendored) — 1 change

@@ -21,6 +21,7 @@ gcovr-report/
 build*/
 out/
 tmp/
+autogen-*.md

 models/*
 models-mnt
common/common.cpp — 3438 changes
File diff suppressed because it is too large.
common/common.h — 115 changes

@@ -14,8 +14,10 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <set>
 #include <unordered_map>
 #include <tuple>
+#include <functional>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'

@@ -57,6 +59,25 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
+
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,

@@ -73,6 +94,8 @@ struct cpu_params {
 };

 struct gpt_params {
+    enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
+
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

     int32_t n_predict = -1; // new tokens to predict

@@ -193,6 +216,7 @@ struct gpt_params {

     bool kl_divergence = false; // compute KL divergence

+    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output

@@ -214,7 +238,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run

@@ -303,13 +326,91 @@ struct gpt_params {
     bool batched_bench_output_jsonl = false;
 };

-void gpt_params_parse_from_env(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env = nullptr;
+    std::string help;
+    void (*handler_void) (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int) (gpt_params & params, int) = nullptr;

-bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
+        this->examples = std::move(examples);
+        return *this;
+    }
+
+    llama_arg & set_env(const char * env) {
+        help = help + "\n(env: " + env + ")";
+        this->env = env;
+        return *this;
+    }
+
+    bool in_example(enum llama_example ex) {
+        return examples.find(ex) != examples.end();
+    }
+
+    bool get_value_from_env(std::string & output) const {
+        if (env == nullptr) return false;
+        char * value = std::getenv(env);
+        if (value) {
+            output = value;
+            return true;
+        }
+        return false;
+    }
+
+    bool has_value_from_env() const {
+        return env != nullptr && std::getenv(env);
+    }
+
+    std::string to_string();
+};
+
+// initialize list of options (arguments) that can be used by the current example
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
+// optionally, we can provide "print_usage" to print example usage
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse   (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
+
+// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
+void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);

 std::string gpt_params_get_system_info(const gpt_params & params);
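Taken together, the new declarations replace the old monolithic parser with a table of self-describing options. A minimal sketch of how an example binary could register and parse one extra option under this API (the flag, help text, handler body, and env var below are hypothetical, not part of this commit):

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);

        // hypothetical flag; a captureless lambda converts to the plain
        // function pointer expected by llama_arg's handler slots
        options.push_back(llama_arg(
            {"--color-output"},
            "highlight generated tokens (hypothetical)",
            [](gpt_params & p) { p.use_color = true; }
        ).set_env("HYPOTHETICAL_COLOR_OUTPUT"));

        if (!gpt_params_parse(argc, argv, params, options)) {
            return 1; // usage for the offending argument was already printed
        }
        return 0;
    }

Since set_env() returns a llama_arg &, the chained call configures the option before push_back copies it into the vector.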
@@ -145,7 +145,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
         /* .params = */ params,
         /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
         /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(params.n_prev),
+        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur    = */ {},
         /* .cur_p  = */ {},
     };
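A plausible reading of this one-line change: the prev ring buffer backs penalty lookback and last-token queries, so flooring its capacity at 32 keeps the sampler state usable even when a caller configures a very small or zero n_prev. The guard in isolation, as a sketch:

    // hypothetical helper mirroring the new initializer above
    static int32_t prev_capacity(int32_t n_prev) {
        return std::max(32, n_prev); // never size the token history below 32
    }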
@@ -28,9 +28,7 @@ static std::vector<int> parse_list(char * p) {
     return ret;
 }

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
     LOG_TEE("\n");

@@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -6,9 +6,7 @@
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
     LOG_TEE("\n");

@@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     return ret;
 }

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
     printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);

@@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -80,8 +80,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -144,8 +144,8 @@ int main(int argc, char ** argv) {

     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -391,9 +391,7 @@ struct lora_merge_ctx {
     }
 };

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n    %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
     printf("\nNOTE: output model is F16\n");

@@ -403,8 +401,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

examples/gen-docs/CMakeLists.txt — new file (5 lines)

@@ -0,0 +1,5 @@
+set(TARGET llama-gen-docs)
+add_executable(${TARGET} gen-docs.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/gen-docs/gen-docs.cpp — new file (51 lines)

@@ -0,0 +1,51 @@
+#include "common.h"
+
+#include <fstream>
+#include <string>
+
+// Export usage message (-h) to markdown format
+
+static void export_md(std::string fname, llama_example ex) {
+    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
+
+    gpt_params params;
+    auto options = gpt_params_parser_init(params, ex);
+
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+    for (auto & opt : options) {
+        file << "| `";
+        // args
+        for (const auto & arg : opt.args) {
+            if (arg == opt.args.front()) {
+                file << arg;
+                if (opt.args.size() > 1) file << ", ";
+            } else {
+                file << arg << (arg != opt.args.back() ? ", " : "");
+            }
+        }
+        // value hint
+        if (opt.value_hint) {
+            std::string md_value_hint(opt.value_hint);
+            string_replace_all(md_value_hint, "|", "\\|");
+            file << " " << md_value_hint;
+        }
+        if (opt.value_hint_2) {
+            std::string md_value_hint_2(opt.value_hint_2);
+            string_replace_all(md_value_hint_2, "|", "\\|");
+            file << " " << md_value_hint_2;
+        }
+        // help text
+        std::string md_help(opt.help);
+        string_replace_all(md_help, "\n", "<br/>");
+        string_replace_all(md_help, "|", "\\|");
+        file << "` | " << md_help << " |\n";
+    }
+}
+
+int main(int, char **) {
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+
+    return 0;
+}
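For reference, each emitted row is a two-column markdown table entry. For a hypothetical option `-t, --threads` with value hint `N` and help text "number of threads", the generated line would look like:

    | `-t, --threads N` | number of threads |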
@@ -154,8 +154,8 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -18,9 +18,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
             "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"

@@ -580,8 +578,8 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     return env->NewStringUTF(result.str().c_str());
 }

-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {

@@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
     return reinterpret_cast<jlong>(batch);
 }

+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = true;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    return reinterpret_cast<jlong>(smpl);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
+    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
+}
+
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {

@@ -380,14 +397,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
-        jlong sampling_pointer,
         jlong batch_pointer,
+        jlong sampler_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto sampling = reinterpret_cast<llama_sampler *>(sampling_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
     const auto model = llama_get_model(context);

     if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);

@@ -395,9 +412,9 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

     // sample the most likely token
-    const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);
+    const auto new_token_id = llama_sampler_sample(sampler, context, -1);

-    llama_sampler_accept(sampling, new_token_id);
+    llama_sampler_accept(sampler, new_token_id);

     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
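Condensed, the new JNI flow builds one greedy llama_sampler chain up front and reuses it every step. A minimal sketch of the generation loop it implements (ctx, batch, smpl, model, n_cur and n_len are assumed to be set up as in the code above):

    // sketch only: chain built once by new_sampler(), then reused
    while (n_cur < n_len) {
        if (llama_decode(ctx, *batch) != 0) break;            // run the model
        llama_token id = llama_sampler_sample(smpl, ctx, -1); // -1 = logits of last token
        llama_sampler_accept(smpl, id);                       // update sampler state
        if (llama_token_is_eog(model, id)) break;             // stop at end-of-generation
        // ...detokenize id, refill the batch with it, ++n_cur...
    }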
@@ -45,8 +45,10 @@ class LLamaAndroid {
    private external fun free_context(context: Long)
    private external fun backend_init(numa: Boolean)
    private external fun backend_free()
-   private external fun free_batch(batch: Long)
    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+   private external fun free_batch(batch: Long)
+   private external fun new_sampler(): Long
+   private external fun free_sampler(sampler: Long)
    private external fun bench_model(
        context: Long,
        model: Long,

@@ -69,6 +71,7 @@ class LLamaAndroid {
    private external fun completion_loop(
        context: Long,
        batch: Long,
+       sampler: Long,
        nLen: Int,
        ncur: IntVar
    ): String?

@@ -101,8 +104,11 @@ class LLamaAndroid {
                    val batch = new_batch(512, 0, 1)
                    if (batch == 0L) throw IllegalStateException("new_batch() failed")

+                   val sampler = new_sampler()
+                   if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
+
                    Log.i(tag, "Loaded model $pathToModel")
-                   threadLocalState.set(State.Loaded(model, context, batch))
+                   threadLocalState.set(State.Loaded(model, context, batch, sampler))
                }
                else -> throw IllegalStateException("Model already loaded")
            }

@@ -114,7 +120,7 @@ class LLamaAndroid {
            is State.Loaded -> {
                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                while (ncur.value <= nlen) {
-                   val str = completion_loop(state.context, state.batch, nlen, ncur)
+                   val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
                    if (str == null) {
                        break
                    }

@@ -138,6 +144,7 @@ class LLamaAndroid {
                free_context(state.context)
                free_model(state.model)
                free_batch(state.batch)
+               free_sampler(state.sampler);

                threadLocalState.set(State.Idle)
            }

@@ -161,7 +168,7 @@ class LLamaAndroid {

    private sealed interface State {
        data object Idle: State
-       data class Loaded(val model: Long, val context: Long, val batch: Long): State
+       data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
    }

    // Enforce only one instance of Llm.
@@ -112,9 +112,7 @@ struct llava_context {
     struct llama_model * model = NULL;
 };

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\n example usage:\n");
     LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");

@@ -280,8 +278,8 @@ int main(int argc, char ** argv) {

     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -293,7 +291,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv, {});
+        print_usage(argc, argv);
         return 1;
     }
     auto model = llava_init(&params);
@@ -253,8 +253,8 @@ int main(int argc, char ** argv) {

     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -266,7 +266,6 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

     if (params.mmproj.empty() || (params.image.empty())) {
-        gpt_params_print_usage(argc, argv, params);
         show_additional_info(argc, argv);
         return 1;
     }
@@ -36,8 +36,8 @@ struct ngram_container {
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -13,8 +13,8 @@
 int main(int argc, char ** argv){
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -15,8 +15,8 @@
 int main(int argc, char ** argv){
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -12,8 +12,8 @@
 int main(int argc, char ** argv){
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -42,6 +42,13 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;

+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();

@@ -132,9 +139,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -102,8 +102,8 @@ int main(int argc, char ** argv) {

     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -6,9 +6,7 @@
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
     LOG_TEE("\n");

@@ -21,8 +19,8 @@ int main(int argc, char ** argv) {
     params.n_keep = 32;
     params.i_pos = -1;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -1968,8 +1968,8 @@ int main(int argc, char ** argv) {
     params.n_ctx = 512;
     params.logits_all = true;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

+The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
+
 *(outdated)*

 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
@@ -4,9 +4,7 @@
 #include <algorithm>
 #include <fstream>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
     LOG_TEE("\n");

@@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -11,8 +11,8 @@ int main(int argc, char ** argv) {
     params.prompt = "The quick brown fox";
     params.sparams.seed = 1234;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -2424,14 +2424,11 @@ int main(int argc, char ** argv) {
     // own arguments required by this example
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

-    // parse arguments from environment variables
-    gpt_params_parse_from_env(params);
-
     // TODO: not great to use extern vars
     server_log_json = params.log_json;
     server_verbose = params.verbosity > 0;
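The standalone gpt_params_parse_from_env() call can disappear here because environment handling now lives on each option (llama_arg::set_env / get_value_from_env in common.h above). A sketch of the pattern, using the real log_json field but a hypothetical flag and env var name:

    // hypothetical: parsing falls back to the env var when the flag is absent
    options.push_back(llama_arg(
        {"--log-json"},
        "log output in JSON (hypothetical help text)",
        [](gpt_params & p) { p.log_json = true; }
    ).set_env("HYPOTHETICAL_LOG_JSON"));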
@@ -9,8 +9,11 @@ Feature: llama.cpp server
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
-    And   1024 as batch size
-    And   1024 as ubatch size
+    # the bert-bge-small model has context size of 512
+    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
+    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
+    And   512 as batch size
+    And   512 as ubatch size
     And   2048 KV cache size
     And   embeddings extraction
     Then  the server is starting
@@ -6,9 +6,7 @@
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
     LOG_TEE("\n");

@@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -29,8 +29,8 @@ struct seq_draft {
 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }

@@ -687,8 +687,8 @@ extern "C" {

     struct ggml_hash_set {
         size_t size;
-        ggml_bitset_t * used;
-        struct ggml_tensor ** keys;
+        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
     };

     // computation graph
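The new comments pin down the invariant: a slot in keys is meaningful only while the matching bit in used is set. Reading a slot safely then looks like this sketch (i is an assumed valid slot index):

    if (ggml_bitset_get(set.used, i)) {
        struct ggml_tensor * t = set.keys[i]; // defined only while the bit is set
        // ... use t ...
    }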
@@ -1278,7 +1278,7 @@ extern "C" {
             size_t                nb1,
             size_t                nb2,
             size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes

     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_inplace(

@@ -1288,19 +1288,19 @@ extern "C" {
             size_t                nb1,
             size_t                nb2,
             size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes

     GGML_API struct ggml_tensor * ggml_set_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes

     GGML_API struct ggml_tensor * ggml_set_1d_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes

     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set_2d(

@@ -1308,7 +1308,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes

     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_2d_inplace(

@@ -1316,7 +1316,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes

     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
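The clarified comments settle a recurring question: these offsets count bytes, not elements. So placing b at element index k of tensor a would be, as a sketch (names assumed):

    // skip k elements = k * element size, since offset is in bytes
    struct ggml_tensor * r = ggml_set_1d(ctx, a, b, k * ggml_element_size(a));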
@@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
         default:
             return true;
     }
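A condensed reading of the new GGML_OP_ROPE_BACK gate. Interpreting mode bit 4 as ChatGLM-style RoPE is an inference from the matching assertion in ggml_rope_back at the end of this diff, not something this hunk states:

    const bool has_freq_factors = op->src[2] != NULL;   // freq factors present
    const bool is_glm           = op->op_params[2] & 4; // assumed: ChatGLM-style mode bit
    return !has_freq_factors && !is_glm;                // CPU claims only the plain case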
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.

-PROJECT_NAME = "llama.cpp"
+PROJECT_NAME = "ggml"

 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version

@@ -44,7 +44,7 @@ PROJECT_NUMBER =
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.

-PROJECT_BRIEF = "llama inference engine"
+PROJECT_BRIEF = "Tensor library for machine learning"

 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
@@ -29,6 +29,7 @@ bool g_mul_mat_q = false;
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
+#include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"

@@ -2184,6 +2185,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_dup(ctx, dst);
             break;
         case GGML_OP_ADD:
+        case GGML_OP_ADD1: // TODO: more efficient implementation
             ggml_cuda_op_add(ctx, dst);
             break;
         case GGML_OP_SUB:

@@ -2200,6 +2202,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_NEG:
+                    ggml_cuda_op_neg(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_GELU:
                     ggml_cuda_op_gelu(ctx, dst);
                     break;

@@ -2308,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_POOL_2D:
             ggml_cuda_op_pool2d(ctx, dst);
             break;
+        case GGML_OP_SUM:
+            ggml_cuda_op_sum(ctx, dst);
+            break;
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;

@@ -2752,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_NEG:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:

@@ -2881,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_ADD1:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:

@@ -2891,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SIN:
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
+        case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
@@ -1,6 +1,6 @@
 #include "common.cuh"
 #include "cross-entropy-loss.cuh"
-#include "sumrows.cuh"
+#include "sum.cuh"

 #include <cmath>
 #include <cstdint>

@@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
     cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);

     // Combine results from individual blocks:
-    sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
+    sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
 }
@@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
     } \

 static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];

@@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     } \

 static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];

ggml/src/ggml-cuda/sum.cu — new file (41 lines)

@@ -0,0 +1,41 @@
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}
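The paired DeviceReduce::Sum calls follow CUB's standard two-phase pattern: the first call, passed a null scratch pointer, only writes the required temporary-buffer size into tmp_size; the allocation then comes from the backend's memory pool and the second call performs the actual reduction. Host-side usage is a one-liner, as a sketch (d_x and d_sum are assumed device pointers obtained elsewhere):

    // reduce n floats in d_x to a single float in d_sum on ctx's stream
    sum_f32_cuda(ctx.pool(), d_x, d_sum, n, ctx.stream());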
ggml/src/ggml-cuda/sum.cuh — new file (5 lines)

@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -1,5 +1,15 @@
 #include "unary.cuh"

+static __global__ void neg_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = -x[i];
+}
+
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
     const float GELU_COEF_A = 0.044715f;
     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

@@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
     dst[i] = cosf(x[i]);
 }

+static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
+    neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);

@@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }

+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
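With the kernel, launcher, and ggml_cuda_op_neg wired into the dispatch above, graphs containing a negation no longer need a CPU fallback. At the graph level the operation is simply, as a sketch (ctx0 and a float tensor x assumed to exist):

    struct ggml_tensor * y = ggml_neg(ctx0, x); // elementwise -x, now CUDA-backed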
@@ -1,5 +1,6 @@
 #include "common.cuh"

+#define CUDA_NEG_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_TANH_BLOCK_SIZE 256

@@ -12,6 +13,8 @@
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256

+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
             return ctx->support_simdgroup_reduction;
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_IM2COL:
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
         case GGML_OP_POOL_2D:
             return false;
@@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
         SYCL_CHECK(
             CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                 look_ahead_size, *qptr)));
+        if (!ptr) {
+            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+            return nullptr;
+        }

         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;

@@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
+    if (!dev_ptr) {
+        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+        return nullptr;
+    }
     ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }

@@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             */
             SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
                     size, *stream)));
+            if (!buf) {
+                char err_buf[1024];
+                snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+                throw std::runtime_error(err_buf);
+            }
             // set padding to 0 to avoid possible NaN values
             if (size > original_size) {
                 /*
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s

 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");

@@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
     }, dryrun);
 }

-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }

-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }

 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {

@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }

-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }

     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");

@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:

@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:

@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }

     vk_context compute_ctx;

@@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

         break;
     case GGML_OP_SIN:
-        ggml_vk_sin(ctx, compute_ctx, src0, node);
+        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);

         break;
     case GGML_OP_COS:
-        ggml_vk_cos(ctx, compute_ctx, src0, node);
+        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);

         break;
     case GGML_OP_CLAMP:

@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
         break;
     default:
-        return;
+        return false;
     }
     break;
 case GGML_OP_DIAG_MASK_INF:

@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

         break;
     default:
-        return;
+        return false;
     }

     if (dryrun) {
-        return;
+        return false;
     }

     ctx->tensor_ctxs[node_idx] = compute_ctx;

@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif

-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
-    }
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
+    }
+    return true;
 }

-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;

     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *

     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");

-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();

-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }

-    // Do staging buffer copies
-    for (auto& cpy : subctx->in_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
+    // Only run if ctx hasn't been submitted yet
+    if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif

-#ifdef GGML_VULKAN_PERF
-    start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        // Do staging buffer copies
+        for (auto& cpy : subctx->in_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }

-    ggml_vk_submit(subctx, ctx->fence);
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
+    }

     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
|
|||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
|
||||
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
|
||||
}
|
||||
ggml_vk_preallocate_buffers(ctx);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
|
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
    // Reserve tensor context space for all nodes
    ctx->tensor_ctxs.resize(cgraph->n_nodes);

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
    }
    bool first_node_in_batch = true; // true if next node will be first node in a batch
    int submit_node_idx = 0; // index to first node in a batch

    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
    constexpr int submit_count = 100;
    int submitted_nodes = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        if (ggml_vk_is_empty(node)) {
            continue;
        if (first_node_in_batch) {
            submit_node_idx = i;
        }

        bool ok = ggml_vk_compute_forward(ctx, node, i);
        if (!ok) {
            if (node->op == GGML_OP_UNARY) {
                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
            } else {
                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
        bool submit = (submitted_nodes >= submit_count) || (i == last_node);

        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);

        if (enqueued) {
            ++submitted_nodes;

#ifndef GGML_VULKAN_CHECK_RESULTS
            if (first_node_in_batch) {
                first_node_in_batch = false;
            }
        }
#ifdef GGML_VULKAN_CHECK_RESULTS
        else {
            ggml_vk_check_results_1(node);
        }
#endif
        GGML_ASSERT(ok);
    }

    if (submit) {
        first_node_in_batch = true;
        submitted_nodes = 0;
    }
}

#ifdef GGML_VULKAN_PERF
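The loop above interleaves CPU command-buffer recording with GPU execution by flushing every submit_count nodes, or at the last node. The same batch-and-flush control flow, distilled into a small generic sketch; process_in_batches and flush are illustrative names, not part of the diff:

    #include <vector>
    #include <functional>

    // Generic batch-and-flush sketch of the control flow above: `flush` is a
    // hypothetical stand-in for the Vulkan submit done by ggml_vk_build_graph.
    void process_in_batches(const std::vector<int> & nodes, int batch_size,
                            const std::function<void(int, int)> & flush) {
        int first   = 0; // index of the first node in the current batch
        int pending = 0; // nodes accumulated since the last flush
        for (int i = 0; i < (int) nodes.size(); ++i) {
            if (pending == 0) {
                first = i; // this node opens a new batch
            }
            ++pending;
            const bool last = (i == (int) nodes.size() - 1);
            if (pending >= batch_size || last) {
                flush(first, i); // submit nodes [first, i], then start a fresh batch
                pending = 0;
            }
        }
    }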
@@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                    return false;
                }
            } break;
        case GGML_OP_CONT:
        case GGML_OP_CPY:
        case GGML_OP_DUP:
            {
@@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_PAD:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_ARGSORT:
@@ -5291,6 +5291,7 @@ struct ggml_tensor * ggml_concat(
    bool is_node = false;

    if (a->grad || b->grad) {
        GGML_ABORT("fatal error"); // TODO: implement
        is_node = true;
    }

@@ -5412,6 +5413,7 @@ struct ggml_tensor * ggml_leaky_relu(
    bool is_node = false;

    if (!inplace && (a->grad)) {
        GGML_ABORT("fatal error"); // TODO: not implemented
        is_node = true;
    }

@@ -5850,6 +5852,7 @@ static struct ggml_tensor * ggml_set_impl(
    // make a view of the destination
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    GGML_ASSERT(offset < (size_t)(1 << 30));
    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
    ggml_set_op_params(result, params, sizeof(params));

@@ -6807,14 +6810,12 @@ struct ggml_tensor * ggml_rope_back(
    GGML_ASSERT(ggml_is_vector(b));
    GGML_ASSERT(b->type == GGML_TYPE_I32);
    GGML_ASSERT(a->ne[2] == b->ne[0]);
    GGML_ASSERT(c == NULL && "freq factors not implemented yet");

    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

    bool is_node = false;

    if (a->grad) {
        is_node = false; // TODO: implement backward
        GGML_ASSERT(false && "backwards pass not implemented");
        is_node = false;
    }

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

@@ -6832,6 +6833,7 @@ struct ggml_tensor * ggml_rope_back(
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = b;
    result->src[2] = c;

    return result;
}

@@ -7385,6 +7387,11 @@ struct ggml_tensor * ggml_argsort(
        enum ggml_sort_order order) {
    bool is_node = false;

    if (a->grad) {
        GGML_ABORT("fatal error"); // TODO: not implemented
        is_node = true;
    }

    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);

    ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -8346,8 +8353,7 @@ static void ggml_compute_forward_dup_same_cont(
    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == dst->type);

    const size_t nb00 = src0->nb[0];
    const size_t nb0 = dst->nb[0];
    const size_t nb0 = ggml_type_size(src0->type);

    const int ith = params->ith; // thread index
    const int nth = params->nth; // number of threads

@@ -8361,8 +8367,8 @@ static void ggml_compute_forward_dup_same_cont(
    if (ie0 < ie1) {
        memcpy(
            ((char *) dst->data + ie0*nb0),
            ((char *) src0->data + ie0*nb00),
            (ie1 - ie0) * ggml_type_size(src0->type));
            ((char *) src0->data + ie0*nb0),
            (ie1 - ie0) * nb0);
    }
}

@@ -8379,11 +8385,6 @@ static void ggml_compute_forward_dup_f16(
    const int ith = params->ith; // thread index
    const int nth = params->nth; // number of threads

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
        ggml_compute_forward_dup_same_cont(params, dst);
        return;
    }

    // parallelize by rows
    const int nr = ne01;
    // number of rows per thread

@@ -8648,11 +8649,6 @@ static void ggml_compute_forward_dup_bf16(
    const int ith = params->ith; // thread index
    const int nth = params->nth; // number of threads

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
        ggml_compute_forward_dup_same_cont(params, dst);
        return;
    }

    // parallelize by rows
    const int nr = ne01;
    // number of rows per thread

@@ -9004,11 +9000,6 @@ static void ggml_compute_forward_dup_f32(
    const int ith = params->ith; // thread index
    const int nth = params->nth; // number of threads

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
        ggml_compute_forward_dup_same_cont(params, dst);
        return;
    }

    // parallelize by rows
    const int nr = ne01;
    // number of rows per thread

@@ -9318,13 +9309,13 @@ static void ggml_compute_forward_dup_bytes(
    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
    GGML_ASSERT(src0->type == dst->type);

    GGML_TENSOR_UNARY_OP_LOCALS;

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
        ggml_compute_forward_dup_same_cont(params, dst);
        return;
    }

    GGML_TENSOR_UNARY_OP_LOCALS;

    const size_t type_size = ggml_type_size(src0->type);
    const int ith = params->ith; // thread index
    const int nth = params->nth; // number of threads
@@ -11015,9 +11006,6 @@ static void ggml_compute_forward_sum_f32(
        return;
    }

    assert(ggml_is_scalar(dst));

    assert(ggml_is_scalar(dst));
    assert(src0->nb[0] == sizeof(float));

@@ -13775,7 +13763,7 @@ static void ggml_compute_forward_get_rows_q(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

        assert(i01 >= 0 && i01 < ne01);
        GGML_ASSERT(i01 >= 0 && i01 < ne01);

        dequantize_row_q(
            (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),

@@ -13816,7 +13804,7 @@ static void ggml_compute_forward_get_rows_f16(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

        assert(i01 >= 0 && i01 < ne01);
        GGML_ASSERT(i01 >= 0 && i01 < ne01);

        ggml_fp16_to_fp32_row(
            (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),

@@ -13857,7 +13845,7 @@ static void ggml_compute_forward_get_rows_bf16(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

        assert(i01 >= 0 && i01 < ne01);
        GGML_ASSERT(i01 >= 0 && i01 < ne01);

        ggml_bf16_to_fp32_row(
            (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),

@@ -13898,7 +13886,7 @@ static void ggml_compute_forward_get_rows_f32(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

        assert(i01 >= 0 && i01 < ne01);
        GGML_ASSERT(i01 >= 0 && i01 < ne01);

        ggml_vec_cpy_f32(nc,
            (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
@@ -18426,14 +18414,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            if (src0->grad || src1->grad) {
                GGML_ASSERT(src0->type == tensor->type);
                GGML_ASSERT(tensor->grad->type == tensor->type);
                GGML_ASSERT(tensor->grad->type == src1->grad->type);
                GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);

                tensor_grad_view = ggml_view_4d(ctx,
                    tensor->grad,
                    src1->grad->ne[0],
                    src1->grad->ne[1],
                    src1->grad->ne[2],
                    src1->grad->ne[3],
                    tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                    nb1, nb2, nb3, offset);
            }

@@ -18502,9 +18486,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                memcpy(&offset, tensor->op_params, sizeof(offset));

                size_t nb1 = tensor->nb[1];
                size_t nb2 = tensor->nb[2];
                size_t nb3 = tensor->nb[3];
                size_t nb1 = tensor->nb[1];
                size_t nb2 = tensor->nb[2];
                size_t nb3 = tensor->nb[3];

                if (src0->type != src0->grad->type) {
                    // gradient is typically F32, but src0 could be other type

@@ -19200,7 +19184,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
    }

    for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
        if (src->visited_hash_set.keys[i]) {
        // copy all hashset keys (tensors) that are in use
        if (ggml_bitset_get(src->visited_hash_set.used, i)) {
            ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
        }
    }
@@ -1006,6 +1006,10 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    assert(nth > 0);
    assert(ith < nth);

    // only enable sgemm for prompt processing
    if (n < 2)
        return false;

    if (Ctype != GGML_TYPE_F32)
        return false;
@@ -401,9 +401,112 @@ static void GetOverlappingTokenSequences(const std::string& str, std::unordered_
    }
}

// KCPP SAMPLING FUNCTIONS
void sample_softmax(llama_token_data_array * cur_p) {
    GGML_ASSERT(cur_p->size > 0);

    // Sort the logits in descending order
    if (!cur_p->sorted) {
        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
            return a.logit > b.logit;
        });
        cur_p->sorted = true;
    }

    float max_l = cur_p->data[0].logit;
    float cum_sum = 0.0f;

    for (size_t i = 0; i < cur_p->size; ++i) {
        float p = expf(cur_p->data[i].logit - max_l);
        cur_p->data[i].p = p;
        cum_sum += p;
    }

    for (size_t i = 0; i < cur_p->size; ++i) {
        cur_p->data[i].p /= cum_sum;
    }
}
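A small usage sketch of sample_softmax: three logits are sorted and normalized so the .p fields sum to 1. Fields are assigned by name rather than brace order because the exact member layout of llama_token_data_array varies across llama.cpp versions; softmax_demo is a hypothetical driver, not part of the diff:

    #include "llama.h"

    void sample_softmax(llama_token_data_array * cur_p); // defined in the hunk above

    void softmax_demo() {
        // logits 2, 1, 0 yield probabilities of roughly 0.665, 0.245 and 0.090
        llama_token_data data[3] = {};
        data[0].id = 0; data[0].logit = 2.0f;
        data[1].id = 1; data[1].logit = 1.0f;
        data[2].id = 2; data[2].logit = 0.0f;

        llama_token_data_array arr = {};
        arr.data   = data;
        arr.size   = 3;
        arr.sorted = false;

        sample_softmax(&arr); // arr.data is now sorted by logit, arr.data[i].p normalized
    }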
void sample_top_k(llama_token_data_array * cur_p, int32_t k) {
    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
    // if (k >= (int32_t)cur_p->size) {
    //     return;
    // }

    if (k <= 0) {
        k = cur_p->size;
    }

    k = std::max(k, (int) 1); //min keep of 1
    k = std::min(k, (int) cur_p->size);

    // Sort scores in descending order
    if (!cur_p->sorted) {
        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
            return a.logit > b.logit;
        };
        if (k <= 128) {
            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
        } else {
            constexpr int nbuckets = 128;
            constexpr float bucket_low = -10.0f;
            constexpr float bucket_high = 10.0f;
            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
            constexpr float bucket_inter = -bucket_low * bucket_scale;

            std::vector<int> bucket_idx(cur_p->size);
            std::vector<int> histo(nbuckets, 0);

            for (int i = 0; i < (int)cur_p->size; ++i) {
                const float val = cur_p->data[i].logit;
                int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
                ib = std::max(0, std::min(nbuckets-1, ib));
                bucket_idx[i] = ib;
                ++histo[ib];
            }
            int nhave = 0;
            int ib = nbuckets - 1;
            for ( ; ib >= 0; --ib) {
                nhave += histo[ib];
                if (nhave >= k) {
                    break;
                }
            }
            std::vector<llama_token_data> tmp_tokens(nhave);
            auto * ptr = tmp_tokens.data();
            std::vector<llama_token_data*> bucket_ptrs;
            bucket_ptrs.reserve(nbuckets - ib);
            for (int j = nbuckets - 1; j >= ib; --j) {
                bucket_ptrs.push_back(ptr);
                ptr += histo[j];
            }
            for (int i = 0; i < (int)cur_p->size; ++i) {
                int j = bucket_idx[i];
                if (j >= ib) {
                    *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
                }
            }

            ptr = tmp_tokens.data();
            int ndone = 0;
            for (int j = nbuckets-1; j > ib; --j) {
                std::sort(ptr, ptr + histo[j], comp);
                ptr += histo[j];
                ndone += histo[j];
            }
            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);

            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));

        }
        cur_p->sorted = true;
    }
    cur_p->size = k;
}
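For k > 128 the sort above switches to a bucket pass: each logit is binned into one of 128 buckets spanning [-10, 10], only the buckets that can contain the top k are sorted, and the rest are skipped. A worked instance of the bucket-index mapping, with values chosen purely for illustration:

    #include <algorithm>

    // Worked instance of the bucket mapping above: 128 buckets over [-10, 10].
    constexpr int   nbuckets     = 128;
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets / (bucket_high - bucket_low); // 6.4 buckets per unit logit
    constexpr float bucket_inter = -bucket_low * bucket_scale;            // 64.0, shifts logit -10 to bucket 0

    int bucket_of(float logit) {
        int ib = int(bucket_scale * logit + bucket_inter);
        return std::max(0, std::min(nbuckets - 1, ib)); // clamp out-of-range logits
    }
    // bucket_of(3.5f) == 86; bucket_of(-10.0f) == 0; bucket_of(12.0f) == 127 (clamped)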
llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
{
    llama_sampler_softmax_impl(candidates);
    sample_softmax(candidates);
    std::vector<float> probs;
    probs.reserve(candidates->size);
    top_picks.clear();

@@ -433,7 +536,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int m, float * mu)
{
    float N = float(n_vocab);
    llama_sampler_softmax_impl(candidates);
    sample_softmax(candidates);
    // Estimate s_hat using the most probable m tokens
    float s_hat = 0.0;
    float sum_ti_bi = 0.0;

@@ -449,7 +552,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
    float epsilon_hat = s_hat - 1;
    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
    // Sample the next word X using top-k sampling
    llama_sampler_top_k_impl(candidates, int(k));
    sample_top_k(candidates, int(k));
    llama_token X = sample_token(candidates, rng); // Compute error as the difference between observed surprise and target surprise value
    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
        return candidate.id == X;
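Mirostat derives its top-k cutoff from the target surprise mu: with Zipf-exponent estimate s_hat, epsilon_hat = s_hat - 1 and vocabulary size N, it computes k = ((epsilon_hat * 2^mu) / (1 - N^(-epsilon_hat)))^(1/s_hat), so a larger mu admits more candidates. A numeric sketch with illustrative values, none taken from the diff:

    #include <cmath>
    #include <cstdio>

    // Numeric sketch of the mirostat cutoff computed above; s_hat, mu and the
    // vocabulary size are illustrative values, not constants from the diff.
    int main() {
        const float N     = 32000.0f; // vocabulary size
        const float s_hat = 1.3f;     // estimated Zipf exponent
        const float mu    = 10.0f;    // 2 * tau for the default tau = 5
        const float epsilon_hat = s_hat - 1.0f;
        const float k = powf((epsilon_hat * powf(2.0f, mu)) / (1.0f - powf(N, -epsilon_hat)),
                             1.0f / s_hat);
        printf("top-k cutoff: %d\n", (int) k); // larger mu -> larger k -> more diversity
        return 0;
    }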
@@ -463,7 +566,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat

llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float * mu)
{
    llama_sampler_softmax_impl(candidates);
    sample_softmax(candidates);
    // Truncate the words with surprise values greater than mu
    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
        return -log2f(candidate.p) > *mu;

@@ -474,7 +577,7 @@ llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::m
    }

    // Normalize the probabilities of the remaining words
    llama_sampler_softmax_impl(candidates);
    sample_softmax(candidates);
    // Sample the next word X from the remaining words
    llama_token X = sample_token(candidates,rng);

@@ -496,7 +599,7 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
        return;
    }

    llama_sampler_softmax_impl(candidates);
    sample_softmax(candidates);

    // Compute the cumulative probabilities
    float maxprob = candidates->data[0].p;

@@ -532,7 +635,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
        return;
    }

    llama_sampler_softmax_impl(candidates);
    sample_softmax(candidates);

    //calculate how many tokens cross the xtc threshold
    size_t last_idx = candidates->size;
@@ -825,18 +928,292 @@ void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, float rep_pen_s

}

void sample_top_p(llama_token_data_array * cur_p, float p, size_t min_keep) {
    if (p >= 1.0f) {
        return;
    }

    sample_softmax(cur_p);

    // Compute the cumulative probabilities
    float cum_sum = 0.0f;
    size_t last_idx = cur_p->size;

    for (size_t i = 0; i < cur_p->size; ++i) {
        cum_sum += cur_p->data[i].p;

        // Check if the running sum is at least p or if we have kept at least min_keep tokens
        // we set the last index to i+1 to indicate that the current iterate should be included in the set
        if (cum_sum >= p && i + 1 >= min_keep) {
            last_idx = i + 1;
            break;
        }
    }

    // Resize the output vector to keep only the top-p tokens
    cur_p->size = last_idx;
}
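sample_top_p keeps the smallest sorted prefix whose probabilities sum to at least p. A hypothetical continuation of the softmax_demo() sketch from earlier, reusing its arr:

    void sample_top_p(llama_token_data_array * cur_p, float p, size_t min_keep); // defined above

    // After sample_softmax(&arr) the probabilities are ~0.665/0.245/0.090,
    // so p = 0.7 keeps the top two tokens.
    void top_p_demo(llama_token_data_array * arr) {
        sample_top_p(arr, 0.7f, /*min_keep=*/1);
        // cum_sum reaches 0.910 at index 1 (>= 0.7), so arr->size == 2 afterwards
    }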
void sample_min_p(llama_token_data_array * cur_p, float p, size_t min_keep) {
    if (p <= 0.0f || !cur_p->size) {
        return;
    }

    bool min_p_applied = false;

    // if the cur_p aren't sorted, try the unsorted implementation first
    if (!cur_p->sorted) {
        std::vector<llama_token_data> filtered_tokens;

        float max_logit = -FLT_MAX;
        for (size_t i = 0; i < cur_p->size; ++i) {
            max_logit = std::max(max_logit, cur_p->data[i].logit);
        }
        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max

        for (size_t i = 0; i < cur_p->size; ++i) {
            if (cur_p->data[i].logit >= min_logit) {
                filtered_tokens.push_back(cur_p->data[i]);
            }
        }

        // if we have enough values the operation was a success
        if (filtered_tokens.size() >= min_keep) {
            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
            cur_p->size = filtered_tokens.size();
            min_p_applied = true;
        }
    }

    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
    if (!min_p_applied) {
        // Sort the logits in descending order
        if (!cur_p->sorted) {
            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
                return a.logit > b.logit;
            });
            cur_p->sorted = true;
        }

        const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max
        size_t i = 1; // first token always matches

        for (; i < cur_p->size; ++i) {
            if (cur_p->data[i].logit < min_logit && i >= min_keep) {
                break; // prob too small
            }
        }

        // Resize the output vector to keep only the matching tokens
        cur_p->size = i;
    }
}
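The min_logit shortcut in sample_min_p follows from softmax algebra: since p_i / p_max = exp(logit_i - logit_max), the condition p_i >= p * p_max is equivalent to logit_i >= logit_max + log(p), which lets the filter operate on raw logits without normalizing first. A small check of that equivalence, with illustrative values:

    #include <cassert>
    #include <cmath>

    // Check that the logit-space threshold used by sample_min_p matches the
    // probability-space definition p_i >= p * p_max (values are illustrative).
    int main() {
        const float l_max = 2.0f, l_i = 0.5f, p = 0.2f;
        const bool by_prob  = expf(l_i) >= p * expf(l_max); // definition
        const bool by_logit = l_i >= l_max + logf(p);       // shortcut
        assert(by_prob == by_logit);
        return 0;
    }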
void sample_tail_free(llama_token_data_array * cur_p, float z, size_t min_keep) {
    if (z >= 1.0f || cur_p->size <= 2) {
        return;
    }

    sample_softmax(cur_p);

    // Compute the first and second derivatives
    std::vector<float> first_derivatives(cur_p->size - 1);
    std::vector<float> second_derivatives(cur_p->size - 2);

    for (size_t i = 0; i < first_derivatives.size(); ++i) {
        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
    }
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
    }

    // Calculate absolute value of second derivatives
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
        second_derivatives[i] = std::abs(second_derivatives[i]);
    }

    // Normalize the second derivatives
    {
        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);

        if (second_derivatives_sum > 1e-6f) {
            for (float & value : second_derivatives) {
                value /= second_derivatives_sum;
            }
        } else {
            for (float & value : second_derivatives) {
                value = 1.0f / second_derivatives.size();
            }
        }
    }

    float cum_sum = 0.0f;
    size_t last_idx = cur_p->size;
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
        cum_sum += second_derivatives[i];

        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
        if (cum_sum > z && i >= min_keep) {
            last_idx = i;
            break;
        }
    }

    // Resize the output vector to keep only the tokens above the tail location
    cur_p->size = last_idx;
}
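Tail-free sampling locates the "tail" through curvature: after softmax, the absolute second differences of the sorted probabilities are normalized into a distribution, and the cutoff falls where their cumulative sum exceeds z. A worked example with illustrative probabilities:

    #include <cstdio>
    #include <cmath>

    // Worked tail-free example; probabilities, z and min_keep are illustrative.
    int main() {
        const float p[5] = {0.5f, 0.3f, 0.1f, 0.05f, 0.05f};
        float d1[4], d2[3], sum = 0.0f;
        for (int i = 0; i < 4; ++i) d1[i] = p[i] - p[i + 1];            // first differences
        for (int i = 0; i < 3; ++i) { d2[i] = fabsf(d1[i] - d1[i + 1]); sum += d2[i]; }
        float cum = 0.0f;
        for (int i = 0; i < 3; ++i) {
            cum += d2[i] / sum;                                          // normalized curvature mass
            printf("i=%d cum=%.2f\n", i, cum);                           // prints 0.00, 0.75, 1.00
        }
        // with z = 0.7 and min_keep = 1, the loop above stops at i = 1, keeping one token
        return 0;
    }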
void sampler_typical(llama_token_data_array * cur_p, float p, size_t min_keep) {
    // Reference implementation:
    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
    if (p >= 1.0f) {
        return;
    }

    // Compute the softmax of logits and calculate entropy
    sample_softmax(cur_p);

    float entropy = 0.0f;
    for (size_t i = 0; i < cur_p->size; ++i) {
        if(cur_p->data[i].p>0)
        {
            entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
        }
    }

    // Compute the absolute difference between negative log probability and entropy for each candidate
    std::vector<float> shifted_scores;
    for (size_t i = 0; i < cur_p->size; ++i) {
        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
        shifted_scores.push_back(shifted_score);
    }

    // Sort tokens based on the shifted_scores and their corresponding indices
    std::vector<size_t> indices(cur_p->size);
    std::iota(indices.begin(), indices.end(), 0);

    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
        return shifted_scores[a] < shifted_scores[b];
    });

    // Compute the cumulative probabilities
    float cum_sum = 0.0f;
    size_t last_idx = indices.size();

    for (size_t i = 0; i < indices.size(); ++i) {
        size_t idx = indices[i];
        cum_sum += cur_p->data[idx].p;

        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
        if (cum_sum > p && i >= min_keep - 1) {
            last_idx = i + 1;
            break;
        }
    }

    // Resize the output vector to keep only the locally typical tokens
    std::vector<llama_token_data> cur_p_new;
    for (size_t i = 0; i < last_idx; ++i) {
        size_t idx = indices[i];
        cur_p_new.push_back(cur_p->data[idx]);
    }

    // Replace the data in cur_p with the cur_p_new data
    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
    cur_p->size = cur_p_new.size();
    cur_p->sorted = false;
}
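Locally typical sampling ranks tokens by how close their surprisal -log(p) is to the distribution's entropy, rather than by probability alone, then keeps the closest-to-typical tokens until mass p is covered. A small sketch of the score, with illustrative probabilities:

    #include <cmath>
    #include <cstdio>

    // Typicality score used above: distance between a token's surprisal and
    // the distribution entropy (probabilities here are illustrative).
    int main() {
        const float probs[3] = {0.7f, 0.2f, 0.1f};
        float entropy = 0.0f;
        for (float p : probs) entropy += -p * logf(p);   // ~0.802 nats
        for (float p : probs) {
            printf("p=%.1f score=%.3f\n", p, fabsf(-logf(p) - entropy));
        }
        // tokens are kept in order of increasing score; here the argmax happens to
        // be the most typical, but for flatter distributions it need not be
        return 0;
    }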
void sample_entropy(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
    // no need to do anything if there is only one (or zero) candidates
    if (cur_p->size <= 1) {
        return;
    }

    // Calculate maximum possible entropy
    float max_entropy = -logf(1.0f / cur_p->size);

    sample_softmax(cur_p);

    // Calculate entropy of the softmax probabilities
    float entropy = 0.0f;
    for (size_t i = 0; i < cur_p->size; ++i) {
        float prob = cur_p->data[i].p;
        if (prob > 0.0f) { // Ensure no log(0)
            entropy -= prob * logf(prob);
        }
    }

    // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
    float normalized_entropy = entropy / max_entropy;

    // Map the normalized entropy to the desired temperature range using the power function
    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);

    // Apply the dynamically calculated temperature scaling
    for (size_t i = 0; i < cur_p->size; ++i) {
        cur_p->data[i].logit /= dyn_temp;
    }

    // Re-compute softmax probabilities after scaling logits with dynamic temperature
    const double max_l_double = cur_p->data[0].logit;

    double cum_sum_double = 0.0;
    for (size_t i = 0; i < cur_p->size; ++i) {
        double p = exp(cur_p->data[i].logit - max_l_double);
        cur_p->data[i].p = p; // Store the scaled probability
        cum_sum_double += p;
    }

    for (size_t i = 0; i < cur_p->size; ++i) {
        cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
    }

    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
    if (smoothing_factor > 0 && cur_p->size > 1) {
        sample_softmax(cur_p);
        float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
        // Apply quadratic transformation using the smoothing_factor
        for (size_t i = 0; i < cur_p->size; ++i)
        {
            float logit_shifted = cur_p->data[i].logit - h;
            cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
        }
        sample_softmax(cur_p);
    }

}
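sample_entropy implements dynamic temperature: normalized entropy in [0, 1] is mapped through a power curve onto [min_temp, max_temp], so peaked (confident) distributions are sampled cold and flat (uncertain) ones hot. A numeric sketch of that mapping, with illustrative bounds:

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    // Dynamic-temperature mapping used above (bounds and exponent are illustrative).
    int main() {
        const float min_temp = 0.5f, max_temp = 1.5f, exponent = 1.0f;
        for (float norm_entropy : {0.0f, 0.25f, 0.5f, 1.0f}) {
            float dyn_temp = min_temp + (max_temp - min_temp) * powf(norm_entropy, exponent);
            printf("norm_entropy=%.2f -> dyn_temp=%.2f\n", norm_entropy, dyn_temp);
        }
        // prints 0.50, 0.75, 1.00, 1.50: low entropy stays conservative, high entropy runs hotter
        return 0;
    }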
void sample_temperature(llama_token_data_array * candidates_p, float temp, float smoothing_factor)
{
    bool isgreedy = false;
    if (temp <= 0)
    {
        // Imitate greedy sampling
        temp = 0.00390625f; //cannot be zero else div0, this is 1/256
        llama_sampler_temp_impl(candidates_p, temp, 0);
        llama_sampler_top_k_impl(candidates_p, 1); //only want first candidate
        smoothing_factor = 0;
        isgreedy = true;
    }
    else

    for (size_t i = 0; i < candidates_p->size; ++i) {
        candidates_p->data[i].logit /= temp;
    }
    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
    if (smoothing_factor > 0 && candidates_p->size > 1) {
        sample_softmax(candidates_p);
        float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
        // Apply quadratic transformation using the smoothing_factor
        for (size_t i = 0; i < candidates_p->size; ++i)
        {
            float logit_shifted = candidates_p->data[i].logit - h;
            candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
        }
        sample_softmax(candidates_p);
    }

    if(isgreedy)
    {
        llama_sampler_temp_impl(candidates_p, temp, smoothing_factor);
        sample_top_k(candidates_p, 1); //only want first candidate
    }
}
@@ -907,7 +1284,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
    sample_dry(n_ctx, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_sequence_breakers, &candidates_p);

    //prefilter to top 5k tokens for improved speed
    llama_sampler_top_k_impl(&candidates_p, 5000);
    sample_top_k(&candidates_p, 5000);

    if (mirostat == 1 || mirostat == 2)
    {

@@ -931,20 +1308,20 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
        switch (sampler_order[i])
        {
            case KCPP_SAMPLER_TOP_K:
                llama_sampler_top_k_impl(&candidates_p, top_k);
                sample_top_k(&candidates_p, top_k);
                break;
            case KCPP_SAMPLER_TOP_A:
                sample_top_a(&candidates_p, top_a, 1);
                break;
            case KCPP_SAMPLER_TOP_P:
                llama_sampler_top_p_impl(&candidates_p, top_p, 1);
                llama_sampler_min_p_impl(&candidates_p, min_p, 1);
                sample_top_p(&candidates_p, top_p, 1);
                sample_min_p(&candidates_p, min_p, 1);
                break;
            case KCPP_SAMPLER_TFS:
                llama_sampler_tail_free_impl(&candidates_p, tfs, 1);
                sample_tail_free(&candidates_p, tfs, 1);
                break;
            case KCPP_SAMPLER_TYP:
                llama_sampler_typical_impl(&candidates_p, typical_p, 1);
                sampler_typical(&candidates_p, typical_p, 1);
                break;
            case KCPP_SAMPLER_TEMP:
                if (dynatemp_range>0)

@@ -955,7 +1332,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
                    dynatemp_min = dynatemp_min<0?0:dynatemp_min;
                    dynatemp_max = dynatemp_max<0?0:dynatemp_max;
                    dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
                    llama_sampler_entropy_impl(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
                    sample_entropy(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
                }
                else
                {
@@ -101,6 +101,10 @@ struct ring_buffer {
    }

    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
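The guard added above makes push_back well-defined for a full buffer: instead of growing, it advances first and overwrites the oldest element. A minimal sketch of that wraparound policy over a fixed-size array; this is a standalone toy, not the llama.cpp struct:

    #include <array>
    #include <cstdio>
    #include <initializer_list>

    // Wraparound policy of the push_back above, sketched over a fixed array of 3.
    int main() {
        std::array<int, 3> buf{};
        size_t first = 0, sz = 0;
        auto push_back = [&](int v) {
            if (sz == buf.size()) {
                first = (first + 1) % buf.size(); // full: drop the oldest element
            } else {
                sz++;
            }
            buf[(first + sz - 1) % buf.size()] = v;
        };
        for (int v : {1, 2, 3, 4, 5}) push_back(v);
        for (size_t i = 0; i < sz; ++i) {
            printf("%d ", buf[(first + i) % buf.size()]); // prints 3 4 5
        }
        return 0;
    }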
File diff suppressed because it is too large
@@ -23,16 +23,6 @@ struct llama_sampler_chain {
    mutable int32_t n_sample;
};

using llama_token_cnt = std::unordered_map<llama_token, int>;

// TODO: tmp exposed until test-sampling is fixed
void llama_sampler_penalties_impl(
    llama_token_data_array * cur_p,
    const llama_token_cnt & token_count,
    float penalty_repeat,
    float penalty_freq,
    float penalty_present);

struct llama_sampler * llama_sampler_init_grammar_impl(
    const struct llama_vocab & vocab,
    const char * grammar_str,
@@ -6449,6 +6449,11 @@ static void llm_load_vocab(
                    )
                ) {
                    vocab.special_eot_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                            __func__, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                    break;
                }
            }

@@ -6462,6 +6467,11 @@ static void llm_load_vocab(
            const auto & t = vocab.token_to_id.find("<|eom_id|>");
            if (t != vocab.token_to_id.end()) {
                vocab.special_eom_id = t->second;
                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                        __func__, t->first.c_str());
                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            }
        }
    }
@@ -16143,6 +16153,13 @@ static int llama_decode_internal(
        return -1;
    }

    for (uint32_t i = 0; i < n_tokens_all; ++i) {
        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
            return -1;
        }
    }

    const auto & model = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

@@ -16435,6 +16452,13 @@ static int llama_encode_internal(
        return -1;
    }

    for (uint32_t i = 0; i < n_tokens; ++i) {
        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
            return -1;
        }
    }

    const auto & model = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;
119
tests/test-arg-parser.cpp
Normal file
@@ -0,0 +1,119 @@
#include <string>
#include <vector>
#include <sstream>
#include <unordered_set>

#undef NDEBUG
#include <cassert>

#include "common.h"

int main(void) {
    gpt_params params;

    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
    for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
        try {
            auto options = gpt_params_parser_init(params, (enum llama_example)ex);
            std::unordered_set<std::string> seen_args;
            std::unordered_set<std::string> seen_env_vars;
            for (const auto & opt : options) {
                // check for args duplications
                for (const auto & arg : opt.args) {
                    if (seen_args.find(arg) == seen_args.end()) {
                        seen_args.insert(arg);
                    } else {
                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
                        exit(1);
                    }
                }
                // check for env var duplications
                if (opt.env) {
                    if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
                        seen_env_vars.insert(opt.env);
                    } else {
                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
                        exit(1);
                    }
                }
            }
        } catch (std::exception & e) {
            printf("%s\n", e.what());
            assert(false);
        }
    }

    auto list_str_to_char = [](std::vector<std::string> & argv) -> std::vector<char *> {
        std::vector<char *> res;
        for (auto & arg : argv) {
            res.push_back(const_cast<char *>(arg.data()));
        }
        return res;
    };

    std::vector<std::string> argv;
    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);

    printf("test-arg-parser: test invalid usage\n\n");

    argv = {"binary_name", "-m"};
    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));

    argv = {"binary_name", "-ngl", "hello"};
    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));

    argv = {"binary_name", "-sm", "hello"};
    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));


    printf("test-arg-parser: test valid usage\n\n");

    argv = {"binary_name", "-m", "model_file.gguf"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
    assert(params.model == "model_file.gguf");

    argv = {"binary_name", "-t", "1234"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
    assert(params.cpuparams.n_threads == 1234);

    argv = {"binary_name", "--verbose"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
    assert(params.verbosity == 1);

    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
    assert(params.model == "abc.gguf");
    assert(params.n_predict == 6789);
    assert(params.n_batch == 9090);

    // skip this part on windows, because setenv is not supported
#ifdef _WIN32
    printf("test-arg-parser: skip on windows build\n");
#else
    printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n");

    setenv("LLAMA_ARG_THREADS", "blah", true);
    argv = {"binary_name"};
    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));

    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
    setenv("LLAMA_ARG_THREADS", "1010", true);
    argv = {"binary_name"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
    assert(params.model == "blah.gguf");
    assert(params.cpuparams.n_threads == 1010);


    printf("test-arg-parser: test environment variables being overwritten\n\n");

    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
    setenv("LLAMA_ARG_THREADS", "1010", true);
    argv = {"binary_name", "-m", "overwritten.gguf"};
    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
    assert(params.model == "overwritten.gguf");
    assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32

    printf("test-arg-parser: all tests OK\n\n");
}