All samplers moved to kcpp side

Concedo 2024-09-09 18:14:11 +08:00
commit b63158005f
54 changed files with 3765 additions and 2577 deletions

.gitignore (vendored, 1 change)

@ -21,6 +21,7 @@ gcovr-report/
build*/
out/
tmp/
autogen-*.md
models/*
models-mnt

File diff suppressed because it is too large.


@ -14,8 +14,10 @@
#include <vector>
#include <random>
#include <thread>
#include <set>
#include <unordered_map>
#include <tuple>
#include <functional>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@ -57,6 +59,25 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_COUNT,
};
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,
@ -73,6 +94,8 @@ struct cpu_params {
};
struct gpt_params {
enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_predict = -1; // new tokens to predict
@ -193,6 +216,7 @@ struct gpt_params {
bool kl_divergence = false; // compute KL divergence
std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
@ -214,7 +238,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
@ -303,13 +326,91 @@ struct gpt_params {
bool batched_bench_output_jsonl = false;
};
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
}
llama_arg & set_env(const char * env) {
help = help + "\n(env: " + env + ")";
this->env = env;
return *this;
}
bool in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
bool get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
char * value = std::getenv(env);
if (value) {
output = value;
return true;
}
return false;
}
bool has_value_from_env() const {
return env != nullptr && std::getenv(env);
}
std::string to_string();
};
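For illustration, a hypothetical option built with the constructors and builder helpers above; this sketch is not part of the diff, and the flag names, env variable and handler body are made up, but the types follow the declarations in `llama_arg` and `gpt_params`:

```cpp
// hypothetical llama_arg definition in the style of the new API (sketch only)
static llama_arg make_n_predict_arg() {
    llama_arg opt(
        {"-n", "--n-predict"},                // accepted flags
        "N",                                  // value hint shown in --help
        "number of tokens to predict",        // help text
        [](gpt_params & params, int value) {  // captureless lambda decays to handler_int
            params.n_predict = value;
        }
    );
    // restrict the option to some examples and register an environment override
    opt.set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_PREDICT");
    return opt;
}
```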
// initialize list of options (arguments) that can be used by the current example
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
// optionally, we can provide "print_usage" to print example usage
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
std::string gpt_params_get_system_info(const gpt_params & params);
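Taken together, the intended call pattern for these declarations is small; here is a minimal sketch of a hypothetical example program (the same two-line change is applied to every example further down in this commit):

```cpp
#include "common.h"

#include <cstdio>

// usage callback with the new, simpler signature
static void print_usage(int /*argc*/, char ** argv) {
    printf("\nexample usage:\n\n  %s -m model.gguf -p \"Hello\" -n 32\n\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;
    // build the option list for this example type, optionally passing a usage callback
    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
    // parse CLI arguments (and env overrides registered via set_env) into params
    if (!gpt_params_parse(argc, argv, params, options)) {
        return 1;
    }
    // ... params is now populated ...
    return 0;
}
```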


@ -145,7 +145,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
/* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(params.n_prev),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};


@ -28,9 +28,7 @@ static std::vector<int> parse_list(char * p) {
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -6,9 +6,7 @@
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -80,8 +80,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -144,8 +144,8 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -391,9 +391,7 @@ struct lora_merge_ctx {
}
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n");
@ -403,8 +401,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -0,0 +1,5 @@
set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@ -0,0 +1,51 @@
#include "common.h"
#include <fstream>
#include <string>
// Export usage message (-h) to markdown format
static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
gpt_params params;
auto options = gpt_params_parser_init(params, ex);
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
for (auto & opt : options) {
file << "| `";
// args
for (const auto & arg : opt.args) {
if (arg == opt.args.front()) {
file << arg;
if (opt.args.size() > 1) file << ", ";
} else {
file << arg << (arg != opt.args.back() ? ", " : "");
}
}
// value hint
if (opt.value_hint) {
std::string md_value_hint(opt.value_hint);
string_replace_all(md_value_hint, "|", "\\|");
file << " " << md_value_hint;
}
if (opt.value_hint_2) {
std::string md_value_hint_2(opt.value_hint_2);
string_replace_all(md_value_hint_2, "|", "\\|");
file << " " << md_value_hint_2;
}
// help text
std::string md_help(opt.help);
string_replace_all(md_help, "\n", "<br/>");
string_replace_all(md_help, "|", "\\|");
file << "` | " << md_help << " |\n";
}
}
int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
return 0;
}


@ -154,8 +154,8 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -18,9 +18,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
@ -580,8 +578,8 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
return env->NewStringUTF(result.str().c_str());
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
return reinterpret_cast<jlong>(batch);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = true;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
return reinterpret_cast<jlong>(smpl);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@ -380,14 +397,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
jlong sampling_pointer,
jlong batch_pointer,
jlong sampler_pointer,
jint n_len,
jobject intvar_ncur
) {
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto sampling = reinterpret_cast<llama_sampler *>(sampling_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
const auto model = llama_get_model(context);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
@ -395,9 +412,9 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
// sample the most likely token
const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
llama_sampler_accept(sampling, new_token_id);
llama_sampler_accept(sampler, new_token_id);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {

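For reference, a condensed sketch (not part of the diff) of the sampler lifecycle that the new JNI bindings above wrap, using only llama.h calls that already appear in this file:

```cpp
#include "llama.h"

// build the same greedy sampler chain as new_sampler() above
static llama_sampler * make_greedy_sampler() {
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = true;                       // skip perf counters, as in the binding
    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
    return smpl;
}

// one decode step: sample from the last set of logits (index -1), then accept the token
static llama_token sample_next(llama_sampler * smpl, llama_context * ctx) {
    const llama_token id = llama_sampler_sample(smpl, ctx, -1);
    llama_sampler_accept(smpl, id);
    return id;
}

// when the model is unloaded, the chain is released with llama_sampler_free(smpl),
// which is what free_sampler() above does
```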

@ -45,8 +45,10 @@ class LLamaAndroid {
private external fun free_context(context: Long)
private external fun backend_init(numa: Boolean)
private external fun backend_free()
private external fun free_batch(batch: Long)
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
private external fun free_batch(batch: Long)
private external fun new_sampler(): Long
private external fun free_sampler(sampler: Long)
private external fun bench_model(
context: Long,
model: Long,
@ -69,6 +71,7 @@ class LLamaAndroid {
private external fun completion_loop(
context: Long,
batch: Long,
sampler: Long,
nLen: Int,
ncur: IntVar
): String?
@ -101,8 +104,11 @@ class LLamaAndroid {
val batch = new_batch(512, 0, 1)
if (batch == 0L) throw IllegalStateException("new_batch() failed")
val sampler = new_sampler()
if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
Log.i(tag, "Loaded model $pathToModel")
threadLocalState.set(State.Loaded(model, context, batch))
threadLocalState.set(State.Loaded(model, context, batch, sampler))
}
else -> throw IllegalStateException("Model already loaded")
}
@ -114,7 +120,7 @@ class LLamaAndroid {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) {
break
}
@ -138,6 +144,7 @@ class LLamaAndroid {
free_context(state.context)
free_model(state.model)
free_batch(state.batch)
free_sampler(state.sampler);
threadLocalState.set(State.Idle)
}
@ -161,7 +168,7 @@ class LLamaAndroid {
private sealed interface State {
data object Idle: State
data class Loaded(val model: Long, val context: Long, val batch: Long): State
data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
}
// Enforce only one instance of Llm.


@ -112,9 +112,7 @@ struct llava_context {
struct llama_model * model = NULL;
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
@ -280,8 +278,8 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}
@ -293,7 +291,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv, {});
print_usage(argc, argv);
return 1;
}
auto model = llava_init(&params);


@ -253,8 +253,8 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}
@ -266,7 +266,6 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty())) {
gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}


@ -36,8 +36,8 @@ struct ngram_container {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -13,8 +13,8 @@
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -15,8 +15,8 @@
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -12,8 +12,8 @@
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -42,6 +42,13 @@ static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool need_insert_eot = false;
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
printf("\n");
}
static bool file_exists(const std::string & path) {
std::ifstream f(path.c_str());
return f.good();
@ -132,9 +139,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -102,8 +102,8 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -6,9 +6,7 @@
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
LOG_TEE("\n");
@ -21,8 +19,8 @@ int main(int argc, char ** argv) {
params.n_keep = 32;
params.i_pos = -1;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -1968,8 +1968,8 @@ int main(int argc, char ** argv) {
params.n_ctx = 512;
params.logits_all = true;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |


@ -4,9 +4,7 @@
#include <algorithm>
#include <fstream>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
LOG_TEE("\n");
@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -11,8 +11,8 @@ int main(int argc, char ** argv) {
params.prompt = "The quick brown fox";
params.sparams.seed = 1234;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -2424,14 +2424,11 @@ int main(int argc, char ** argv) {
// own arguments required by this example
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}
// parse arguments from environment variables
gpt_params_parse_from_env(params);
// TODO: not great to use extern vars
server_log_json = params.log_json;
server_verbose = params.verbosity > 0;


@ -9,8 +9,11 @@ Feature: llama.cpp server
And a model alias bert-bge-small
And 42 as server seed
And 2 slots
And 1024 as batch size
And 1024 as ubatch size
# the bert-bge-small model has context size of 512
# since the generated prompts are as big as the batch size, we need to set the batch size to 512
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
And 512 as batch size
And 512 as ubatch size
And 2048 KV cache size
And embeddings extraction
Then the server is starting


@ -6,9 +6,7 @@
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
LOG_TEE("\n");
@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -29,8 +29,8 @@ struct seq_draft {
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
if (!gpt_params_parse(argc, argv, params, options)) {
return 1;
}


@ -687,8 +687,8 @@ extern "C" {
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used;
struct ggml_tensor ** keys;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
// computation graph
@ -1278,7 +1278,7 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
@ -1288,19 +1288,19 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes
GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
@ -1308,7 +1308,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@ -1316,7 +1316,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(


@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
default:
return true;
}


@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
PROJECT_NAME = "llama.cpp"
PROJECT_NAME = "ggml"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
@ -44,7 +44,7 @@ PROJECT_NUMBER =
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
PROJECT_BRIEF = "llama inference engine"
PROJECT_BRIEF = "Tensor library for machine learning"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55


@ -29,6 +29,7 @@ bool g_mul_mat_q = false;
#include "ggml-cuda/rope.cuh"
#include "ggml-cuda/scale.cuh"
#include "ggml-cuda/softmax.cuh"
#include "ggml-cuda/sum.cuh"
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
@ -2184,6 +2185,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_dup(ctx, dst);
break;
case GGML_OP_ADD:
case GGML_OP_ADD1: // TODO: more efficient implementation
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
@ -2200,6 +2202,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_NEG:
ggml_cuda_op_neg(ctx, dst);
break;
case GGML_UNARY_OP_GELU:
ggml_cuda_op_gelu(ctx, dst);
break;
@ -2308,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
case GGML_OP_SUM:
ggml_cuda_op_sum(ctx, dst);
break;
case GGML_OP_SUM_ROWS:
ggml_cuda_op_sum_rows(ctx, dst);
break;
@ -2752,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
@ -2881,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
@ -2891,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:


@ -1,6 +1,6 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include "sum.cuh"
#include <cmath>
#include <cstdint>
@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
}


@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
} \
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[1];
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];
@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
} \
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[1];
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];

ggml/src/ggml-cuda/sum.cu (new file, 41 lines)

@ -0,0 +1,41 @@
#include "sumrows.cuh"
#include "sum.cuh"
#include <cstdint>
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
#else
// Use (inefficient) sum_rows implementation as a fallback.
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
const int64_t ne = ggml_nelements(src0);
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
}


@ -0,0 +1,5 @@
#include "common.cuh"
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -1,5 +1,15 @@
#include "unary.cuh"
static __global__ void neg_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = -x[i];
}
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
dst[i] = cosf(x[i]);
}
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;


@ -1,5 +1,6 @@
#include "common.cuh"
#define CUDA_NEG_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
@ -12,6 +13,8 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return ctx->support_simdgroup_reduction;
case GGML_OP_NORM:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
return true;
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D:
case GGML_OP_POOL_2D:
return false;


@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
SYCL_CHECK(
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
look_ahead_size, *qptr)));
if (!ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
return nullptr;
}
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
if (!dev_ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
return nullptr;
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
*/
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
size, *stream)));
if (!buf) {
char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
throw std::runtime_error(err_buf);
}
// set padding to 0 to avoid possible NaN values
if (size > original_size) {
/*


@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
if (fence) {
ctx->q->queue.submit({}, fence);
}
return;
}
VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun);
}
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}, dryrun);
}
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}, dryrun);
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
}
}
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
// Returns true if node has enqueued work into the queue, false otherwise
// If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
if (ggml_is_empty(node) || extra == nullptr) {
return;
return false;
}
VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
return;
return false;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_UNARY_OP_TANH:
break;
default:
return;
return false;
}
break;
case GGML_OP_REPEAT:
@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ABORT("fatal error");
return;
return false;
}
vk_context compute_ctx;
@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
case GGML_OP_SIN:
ggml_vk_sin(ctx, compute_ctx, src0, node);
ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_COS:
ggml_vk_cos(ctx, compute_ctx, src0, node);
ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_CLAMP:
@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
break;
default:
return;
return false;
}
break;
case GGML_OP_DIAG_MASK_INF:
@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
default:
return;
return false;
}
if (dryrun) {
return;
return false;
}
ctx->tensor_ctxs[node_idx] = compute_ctx;
@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
last_node = true;
#endif
if (last_node) {
if (submit || last_node) {
ggml_vk_ctx_end(compute_ctx);
compute_ctx->exit_tensor_idx = node_idx;
// TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
if (last_node) {
compute_ctx->exit_tensor_idx = node_idx_begin;
}
else {
compute_ctx->exit_tensor_idx = -1;
}
ctx->compute_ctx.reset();
bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
}
else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
}
}
}
return true;
}
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
ggml_tensor_extra_gpu * extra = nullptr;
switch (tensor->op) {
@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_0(tensor);
#endif
vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
#ifdef GGML_VULKAN_PERF
std::chrono::steady_clock::time_point start;
#endif // GGML_VULKAN_PERF
// always wait for the GPU work to be done for the last submit
if (tensor_idx == subctx->exit_tensor_idx) {
use_fence = true;
}
// Only run if ctx hasn't been submitted yet
if (!subctx->seqs.empty()) {
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_0(tensor);
use_fence = true;
#endif
// Do staging buffer copies
for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
#ifdef GGML_VULKAN_PERF
start = std::chrono::steady_clock::now();
#endif // GGML_VULKAN_PERF
ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
ggml_vk_submit(subctx, ctx->fence);
if (use_fence) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
ctx->device->device.resetFences({ ctx->fence });
}
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_1(tensor);
#endif
}
if (tensor_idx == subctx->exit_tensor_idx) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
#ifdef GGML_VULKAN_PERF
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
ctx->device->perf_logger->log_timing(tensor, duration.count());
#endif // GGML_VULKAN_PERF
ctx->device->device.resetFences({ ctx->fence });
// Do staging buffer copies
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
}
ggml_vk_preallocate_buffers(ctx);
ggml_pipeline_allocate_descriptor_sets(ctx->device);
@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
// Reserve tensor context space for all nodes
ctx->tensor_ctxs.resize(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
}
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch
// submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
constexpr int submit_count = 100;
int submitted_nodes = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (ggml_vk_is_empty(node)) {
continue;
if (first_node_in_batch) {
submit_node_idx = i;
}
bool ok = ggml_vk_compute_forward(ctx, node, i);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
} else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
bool submit = (submitted_nodes >= submit_count) || (i == last_node);
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
if (enqueued) {
++submitted_nodes;
#ifndef GGML_VULKAN_CHECK_RESULTS
if (first_node_in_batch) {
first_node_in_batch = false;
}
}
#ifdef GGML_VULKAN_CHECK_RESULTS
else {
ggml_vk_check_results_1(node);
}
#endif
GGML_ASSERT(ok);
}
if (submit) {
first_node_in_batch = true;
submitted_nodes = 0;
}
}
#ifdef GGML_VULKAN_PERF
@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
return false;
}
} break;
case GGML_OP_CONT:
case GGML_OP_CPY:
case GGML_OP_DUP:
{
@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ARGSORT:


@ -5291,6 +5291,7 @@ struct ggml_tensor * ggml_concat(
bool is_node = false;
if (a->grad || b->grad) {
GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}
@ -5412,6 +5413,7 @@ struct ggml_tensor * ggml_leaky_relu(
bool is_node = false;
if (!inplace && (a->grad)) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}
@ -5850,6 +5852,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
GGML_ASSERT(offset < (size_t)(1 << 30));
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
@ -6807,14 +6810,12 @@ struct ggml_tensor * ggml_rope_back(
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT(c == NULL && "freq factors not implemented yet");
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
bool is_node = false;
if (a->grad) {
is_node = false; // TODO: implement backward
GGML_ASSERT(false && "backwards pass not implemented");
is_node = false;
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
@ -6832,6 +6833,7 @@ struct ggml_tensor * ggml_rope_back(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
@ -7385,6 +7387,11 @@ struct ggml_tensor * ggml_argsort(
enum ggml_sort_order order) {
bool is_node = false;
if (a->grad) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
@ -8346,8 +8353,7 @@ static void ggml_compute_forward_dup_same_cont(
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type);
const size_t nb00 = src0->nb[0];
const size_t nb0 = dst->nb[0];
const size_t nb0 = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
@ -8361,8 +8367,8 @@ static void ggml_compute_forward_dup_same_cont(
if (ie0 < ie1) {
memcpy(
((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00),
(ie1 - ie0) * ggml_type_size(src0->type));
((char *) src0->data + ie0*nb0),
(ie1 - ie0) * nb0);
}
}
@ -8379,11 +8385,6 @@ static void ggml_compute_forward_dup_f16(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@ -8648,11 +8649,6 @@ static void ggml_compute_forward_dup_bf16(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@ -9004,11 +9000,6 @@ static void ggml_compute_forward_dup_f32(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@ -9318,13 +9309,13 @@ static void ggml_compute_forward_dup_bytes(
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
GGML_TENSOR_UNARY_OP_LOCALS;
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
@ -11015,9 +11006,6 @@ static void ggml_compute_forward_sum_f32(
return;
}
assert(ggml_is_scalar(dst));
assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));
@ -13775,7 +13763,7 @@ static void ggml_compute_forward_get_rows_q(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
GGML_ASSERT(i01 >= 0 && i01 < ne01);
dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@ -13816,7 +13804,7 @@ static void ggml_compute_forward_get_rows_f16(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
GGML_ASSERT(i01 >= 0 && i01 < ne01);
ggml_fp16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@ -13857,7 +13845,7 @@ static void ggml_compute_forward_get_rows_bf16(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
GGML_ASSERT(i01 >= 0 && i01 < ne01);
ggml_bf16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@ -13898,7 +13886,7 @@ static void ggml_compute_forward_get_rows_f32(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
GGML_ASSERT(i01 >= 0 && i01 < ne01);
ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
@ -18426,14 +18414,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
if (src0->grad || src1->grad) {
GGML_ASSERT(src0->type == tensor->type);
GGML_ASSERT(tensor->grad->type == tensor->type);
GGML_ASSERT(tensor->grad->type == src1->grad->type);
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
nb1, nb2, nb3, offset);
}
@ -18502,9 +18486,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
memcpy(&offset, tensor->op_params, sizeof(offset));
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
if (src0->type != src0->grad->type) {
// gradient is typically F32, but src0 could be other type
@ -19200,7 +19184,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
}
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
if (src->visited_hash_set.keys[i]) {
// copy all hashset keys (tensors) that are in use
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
}
}


@ -1006,6 +1006,10 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
assert(nth > 0);
assert(ith < nth);
// only enable sgemm for prompt processing
if (n < 2)
return false;
if (Ctype != GGML_TYPE_F32)
return false;


@ -401,9 +401,112 @@ static void GetOverlappingTokenSequences(const std::string& str, std::unordered_
}
}
// KCPP SAMPLING FUNCTIONS
void sample_softmax(llama_token_data_array * cur_p) {
GGML_ASSERT(cur_p->size > 0);
// Sort the logits in descending order
if (!cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
cur_p->sorted = true;
}
float max_l = cur_p->data[0].logit;
float cum_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
float p = expf(cur_p->data[i].logit - max_l);
cur_p->data[i].p = p;
cum_sum += p;
}
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= cum_sum;
}
}
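A note on the max-subtraction above: shifting every logit by the largest one before exponentiating keeps expf() from overflowing while leaving the normalized probabilities unchanged. A minimal standalone sketch of the same computation on a plain std::vector (illustrative only, not the kcpp data structures):

#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: numerically stable softmax over raw logits (assumes a non-empty input).
static std::vector<float> stable_softmax(std::vector<float> logits) {
    const float max_l = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (float & l : logits) {
        l = std::exp(l - max_l); // largest exponent becomes exp(0) = 1
        sum += l;
    }
    for (float & l : logits) {
        l /= sum;                // normalize to probabilities
    }
    return logits;
}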
void sample_top_k(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) {
// return;
// }
if (k <= 0) {
k = cur_p->size;
}
k = std::max(k, (int) 1); //min keep of 1
k = std::min(k, (int) cur_p->size);
// Sort scores in descending order
if (!cur_p->sorted) {
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
};
if (k <= 128) {
std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
} else {
constexpr int nbuckets = 128;
constexpr float bucket_low = -10.0f;
constexpr float bucket_high = 10.0f;
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
constexpr float bucket_inter = -bucket_low * bucket_scale;
std::vector<int> bucket_idx(cur_p->size);
std::vector<int> histo(nbuckets, 0);
for (int i = 0; i < (int)cur_p->size; ++i) {
const float val = cur_p->data[i].logit;
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
ib = std::max(0, std::min(nbuckets-1, ib));
bucket_idx[i] = ib;
++histo[ib];
}
int nhave = 0;
int ib = nbuckets - 1;
for ( ; ib >= 0; --ib) {
nhave += histo[ib];
if (nhave >= k) {
break;
}
}
std::vector<llama_token_data> tmp_tokens(nhave);
auto * ptr = tmp_tokens.data();
std::vector<llama_token_data*> bucket_ptrs;
bucket_ptrs.reserve(nbuckets - ib);
for (int j = nbuckets - 1; j >= ib; --j) {
bucket_ptrs.push_back(ptr);
ptr += histo[j];
}
for (int i = 0; i < (int)cur_p->size; ++i) {
int j = bucket_idx[i];
if (j >= ib) {
*bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
}
}
ptr = tmp_tokens.data();
int ndone = 0;
for (int j = nbuckets-1; j > ib; --j) {
std::sort(ptr, ptr + histo[j], comp);
ptr += histo[j];
ndone += histo[j];
}
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
}
cur_p->sorted = true;
}
cur_p->size = k;
}
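The branch above trades a full sort for a histogram when k > 128: logits are binned into 128 fixed-width buckets over [-10, 10], only the highest buckets that together hold at least k entries are copied out, and only that slice is sorted. A cut-down sketch of the same selection on plain floats (helper name and layout are illustrative, not part of this commit):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Sketch: return the k largest values via histogram bucketing + partial sort,
// mirroring the strategy in sample_top_k above (not the actual kcpp helper).
static std::vector<float> bucket_top_k(const std::vector<float> & vals, int k) {
    if (vals.empty()) {
        return {};
    }
    constexpr int   nbuckets = 128;
    constexpr float lo = -10.0f, hi = 10.0f;
    constexpr float scale = nbuckets / (hi - lo);

    k = std::max(1, std::min<int>(k, (int) vals.size()));

    std::vector<int> histo(nbuckets, 0);
    std::vector<int> bucket_of(vals.size());
    for (std::size_t i = 0; i < vals.size(); ++i) {
        int b = (int) ((vals[i] - lo) * scale);
        b = std::max(0, std::min(nbuckets - 1, b));
        bucket_of[i] = b;
        ++histo[b];
    }

    // walk buckets from the top until they cover at least k elements
    int need = k, cutoff = nbuckets - 1;
    while (cutoff > 0 && need > histo[cutoff]) {
        need -= histo[cutoff];
        --cutoff;
    }

    // gather candidates from the surviving buckets, then sort just that slice
    std::vector<float> cand;
    for (std::size_t i = 0; i < vals.size(); ++i) {
        if (bucket_of[i] >= cutoff) {
            cand.push_back(vals[i]);
        }
    }
    std::partial_sort(cand.begin(), cand.begin() + k, cand.end(), std::greater<float>());
    cand.resize(k);
    return cand;
}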
llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
{
llama_sampler_softmax_impl(candidates);
sample_softmax(candidates);
std::vector<float> probs;
probs.reserve(candidates->size);
top_picks.clear();
@@ -433,7 +536,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int m, float * mu)
{
float N = float(n_vocab);
llama_sampler_softmax_impl(candidates);
sample_softmax(candidates);
// Estimate s_hat using the most probable m tokens
float s_hat = 0.0;
float sum_ti_bi = 0.0;
@@ -449,7 +552,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
float epsilon_hat = s_hat - 1;
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
// Sample the next word X using top-k sampling
llama_sampler_top_k_impl(candidates, int(k));
sample_top_k(candidates, int(k));
llama_token X = sample_token(candidates, rng);
// Compute error as the difference between observed surprise and target surprise value
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
return candidate.id == X;
@@ -463,7 +566,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float * mu)
{
llama_sampler_softmax_impl(candidates);
sample_softmax(candidates);
// Truncate the words with surprise values greater than mu
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
return -log2f(candidate.p) > *mu;
@@ -474,7 +577,7 @@ llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::m
}
// Normalize the probabilities of the remaining words
llama_sampler_softmax_impl(candidates);
sample_softmax(candidates);
// Sample the next word X from the remaining words
llama_token X = sample_token(candidates,rng);
@@ -496,7 +599,7 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
return;
}
llama_sampler_softmax_impl(candidates);
sample_softmax(candidates);
// Compute the cumulative probabilities
float maxprob = candidates->data[0].p;
@@ -532,7 +635,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
return;
}
llama_sampler_softmax_impl(candidates);
sample_softmax(candidates);
//calculate how many tokens cross the xtc threshold
size_t last_idx = candidates->size;
@@ -825,18 +928,292 @@ void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, float rep_pen_s
}
void sample_top_p(llama_token_data_array * cur_p, float p, size_t min_keep) {
if (p >= 1.0f) {
return;
}
sample_softmax(cur_p);
// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = cur_p->size;
for (size_t i = 0; i < cur_p->size; ++i) {
cum_sum += cur_p->data[i].p;
// Check if the running sum is at least p or if we have kept at least min_keep tokens
// we set the last index to i+1 to indicate that the current iterate should be included in the set
if (cum_sum >= p && i + 1 >= min_keep) {
last_idx = i + 1;
break;
}
}
// Resize the output vector to keep only the top-p tokens
cur_p->size = last_idx;
}
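Top-p (nucleus) filtering keeps the smallest prefix of the probability-sorted list whose cumulative mass reaches p, never fewer than min_keep entries. A self-contained sketch on a pre-softmaxed, descending-sorted probability vector (illustrative, not the kcpp entry point):

#include <cstddef>
#include <vector>

// Sketch: truncate descending-sorted probabilities to the nucleus of mass >= p.
static void top_p_truncate(std::vector<float> & probs /* sorted desc, sums to 1 */,
                           float p, std::size_t min_keep) {
    if (p >= 1.0f || probs.empty()) {
        return;
    }
    float cum = 0.0f;
    std::size_t keep = probs.size();
    for (std::size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= p && i + 1 >= min_keep) {
            keep = i + 1; // include the element that crossed the threshold
            break;
        }
    }
    probs.resize(keep);
}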
void sample_min_p(llama_token_data_array * cur_p, float p, size_t min_keep) {
if (p <= 0.0f || !cur_p->size) {
return;
}
bool min_p_applied = false;
// if the cur_p aren't sorted, try the unsorted implementation first
if (!cur_p->sorted) {
std::vector<llama_token_data> filtered_tokens;
float max_logit = -FLT_MAX;
for (size_t i = 0; i < cur_p->size; ++i) {
max_logit = std::max(max_logit, cur_p->data[i].logit);
}
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].logit >= min_logit) {
filtered_tokens.push_back(cur_p->data[i]);
}
}
// if we have enough values the operation was a success
if (filtered_tokens.size() >= min_keep) {
memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
cur_p->size = filtered_tokens.size();
min_p_applied = true;
}
}
// if the cur_p are sorted or the unsorted implementation failed, use this implementation
if (!min_p_applied) {
// Sort the logits in descending order
if (!cur_p->sorted) {
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
cur_p->sorted = true;
}
const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max
size_t i = 1; // first token always matches
for (; i < cur_p->size; ++i) {
if (cur_p->data[i].logit < min_logit && i >= min_keep) {
break; // prob too small
}
}
// Resize the output vector to keep only the matching tokens
cur_p->size = i;
}
}
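The unsorted fast path above relies on a log-domain identity: because every probability shares the same softmax normalizer, p_i >= p * p_max is equivalent to logit_i >= logit_max + ln(p), so the filter can run directly on raw logits without computing a softmax first. A tiny self-contained check of that identity (the concrete values are arbitrary):

#include <cassert>
#include <cmath>

// Illustrative check of the identity used by the unsorted min-p path above:
//   p_i >= p * p_max   <=>   logit_i >= logit_max + ln(p)
// since p_i = exp(logit_i - m) / Z and p_max = exp(logit_max - m) / Z share Z.
int main() {
    const float logit_max = 3.2f, logit_i = 1.1f, p = 0.05f; // arbitrary values
    const float z = std::exp(logit_max) + std::exp(logit_i); // toy two-token normalizer
    const bool by_prob  = std::exp(logit_i) / z >= p * (std::exp(logit_max) / z);
    const bool by_logit = logit_i >= logit_max + std::log(p);
    assert(by_prob == by_logit);
    return 0;
}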
void sample_tail_free(llama_token_data_array * cur_p, float z, size_t min_keep) {
if (z >= 1.0f || cur_p->size <= 2) {
return;
}
sample_softmax(cur_p);
// Compute the first and second derivatives
std::vector<float> first_derivatives(cur_p->size - 1);
std::vector<float> second_derivatives(cur_p->size - 2);
for (size_t i = 0; i < first_derivatives.size(); ++i) {
first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
}
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
}
// Calculate absolute value of second derivatives
for (size_t i = 0; i < second_derivatives.size(); ++i) {
second_derivatives[i] = std::abs(second_derivatives[i]);
}
// Normalize the second derivatives
{
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
}
float cum_sum = 0.0f;
size_t last_idx = cur_p->size;
for (size_t i = 0; i < second_derivatives.size(); ++i) {
cum_sum += second_derivatives[i];
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
if (cum_sum > z && i >= min_keep) {
last_idx = i;
break;
}
}
// Resize the output vector to keep only the tokens above the tail location
cur_p->size = last_idx;
}
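Tail-free sampling looks for the point where the sorted probability curve stops bending: it takes discrete first and second differences of the descending probabilities, turns the absolute second differences into a weight distribution, and cuts once the cumulative weight exceeds z. The two difference loops above collapse to weight[i] = |p[i] - 2*p[i+1] + p[i+2]|, as in this sketch (illustrative helper, not part of the diff):

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch: the "curvature" weights used by tail-free sampling.
// For descending-sorted probabilities p, weight[i] = |p[i] - 2*p[i+1] + p[i+2]|
// (the absolute discrete second derivative), normalized to sum to 1.
static std::vector<float> tfs_weights(const std::vector<float> & p) {
    std::vector<float> w;
    if (p.size() < 3) {
        return w;
    }
    float sum = 0.0f;
    for (std::size_t i = 0; i + 2 < p.size(); ++i) {
        w.push_back(std::fabs(p[i] - 2.0f * p[i + 1] + p[i + 2]));
        sum += w.back();
    }
    for (float & x : w) {
        x = sum > 1e-6f ? x / sum : 1.0f / w.size(); // same fallback as above
    }
    return w;
}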
void sampler_typical(llama_token_data_array * cur_p, float p, size_t min_keep) {
// Reference implementation:
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
if (p >= 1.0f) {
return;
}
// Compute the softmax of logits and calculate entropy
sample_softmax(cur_p);
float entropy = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if(cur_p->data[i].p>0)
{
entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
}
}
// Compute the absolute difference between negative log probability and entropy for each candidate
std::vector<float> shifted_scores;
for (size_t i = 0; i < cur_p->size; ++i) {
float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
shifted_scores.push_back(shifted_score);
}
// Sort tokens based on the shifted_scores and their corresponding indices
std::vector<size_t> indices(cur_p->size);
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
return shifted_scores[a] < shifted_scores[b];
});
// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = indices.size();
for (size_t i = 0; i < indices.size(); ++i) {
size_t idx = indices[i];
cum_sum += cur_p->data[idx].p;
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
if (cum_sum > p && i >= min_keep - 1) {
last_idx = i + 1;
break;
}
}
// Resize the output vector to keep only the locally typical tokens
std::vector<llama_token_data> cur_p_new;
for (size_t i = 0; i < last_idx; ++i) {
size_t idx = indices[i];
cur_p_new.push_back(cur_p->data[idx]);
}
// Replace the data in cur_p with the cur_p_new data
std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
cur_p->size = cur_p_new.size();
cur_p->sorted = false;
}
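Locally typical sampling keeps the tokens whose surprisal -ln(p_i) is closest to the distribution's entropy H, then truncates that reordered list at cumulative mass p. A sketch of just the scoring step on plain probabilities (hypothetical helper, not the kcpp API):

#include <cmath>
#include <vector>

// Sketch: the "typicality" score used above.
// score_i = | -ln(p_i) - H |, where H = -sum_j p_j * ln(p_j).
static std::vector<float> typicality_scores(const std::vector<float> & probs) {
    float H = 0.0f;
    for (float p : probs) {
        if (p > 0.0f) {
            H -= p * std::log(p);
        }
    }
    std::vector<float> scores;
    scores.reserve(probs.size());
    for (float p : probs) {
        scores.push_back(std::fabs(-std::log(p) - H)); // smaller = more "typical"
    }
    return scores;
}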
void sample_entropy(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
// no need to do anything if there is only one (or zero) candidates
if (cur_p->size <= 1) {
return;
}
// Calculate maximum possible entropy
float max_entropy = -logf(1.0f / cur_p->size);
sample_softmax(cur_p);
// Calculate entropy of the softmax probabilities
float entropy = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
float prob = cur_p->data[i].p;
if (prob > 0.0f) { // Ensure no log(0)
entropy -= prob * logf(prob);
}
}
// Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
float normalized_entropy = entropy / max_entropy;
// Map the normalized entropy to the desired temperature range using the power function
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
// Apply the dynamically calculated temperature scaling
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= dyn_temp;
}
// Re-compute softmax probabilities after scaling logits with dynamic temperature
const double max_l_double = cur_p->data[0].logit;
double cum_sum_double = 0.0;
for (size_t i = 0; i < cur_p->size; ++i) {
double p = exp(cur_p->data[i].logit - max_l_double);
cur_p->data[i].p = p; // Store the scaled probability
cum_sum_double += p;
}
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
}
// Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
if (smoothing_factor > 0 && cur_p->size > 1) {
sample_softmax(cur_p);
float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
// Apply quadratic transformation using the smoothing_factor
for (size_t i = 0; i < cur_p->size; ++i)
{
float logit_shifted = cur_p->data[i].logit - h;
cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
}
sample_softmax(cur_p);
}
}
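The dynamic-temperature mapping above can be read as T = min_temp + (max_temp - min_temp) * (H / H_max)^exponent, with H the entropy of the candidates and H_max = ln(n) the entropy of a uniform distribution over n candidates: a near-uniform (uncertain) distribution is sampled hotter, a peaked one colder. A one-function sketch of that mapping (illustrative only):

#include <cmath>

// Sketch of the entropy -> temperature mapping used by sample_entropy above.
// H is the entropy of the current candidates, n their count (n > 1).
static float dynamic_temp(float H, int n, float min_temp, float max_temp, float exponent_val) {
    const float max_entropy = std::log((float) n);            // entropy of a uniform distribution
    const float normalized  = max_entropy > 0.0f ? H / max_entropy : 0.0f;
    return min_temp + (max_temp - min_temp) * std::pow(normalized, exponent_val);
}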
void sample_temperature(llama_token_data_array * candidates_p, float temp, float smoothing_factor)
{
bool isgreedy = false;
if (temp <= 0)
{
// Imitate greedy sampling
temp = 0.00390625f; //cannot be zero else div0, this is 1/256
llama_sampler_temp_impl(candidates_p, temp, 0);
llama_sampler_top_k_impl(candidates_p, 1); //only want first candidate
smoothing_factor = 0;
isgreedy = true;
}
else
for (size_t i = 0; i < candidates_p->size; ++i) {
candidates_p->data[i].logit /= temp;
}
// Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
if (smoothing_factor > 0 && candidates_p->size > 1) {
sample_softmax(candidates_p);
float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
// Apply quadratic transformation using the smoothing_factor
for (size_t i = 0; i < candidates_p->size; ++i)
{
float logit_shifted = candidates_p->data[i].logit - h;
candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
}
sample_softmax(candidates_p);
}
if(isgreedy)
{
llama_sampler_temp_impl(candidates_p, temp, smoothing_factor);
sample_top_k(candidates_p, 1); //only want first candidate
}
}
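The smoothing branch applies a quadratic transform around the top logit h: logit' = -s * (logit - h)^2 + h. The best token keeps its logit, and every other token is pushed down by an amount that grows with both its distance from the top and the smoothing factor s, sharpening the distribution. A minimal sketch of that transform (illustrative, not the kcpp entry point):

#include <vector>

// Sketch of the quadratic smoothing transform used above:
//   logit' = -smoothing_factor * (logit - h)^2 + h, where h is the maximum logit.
static void quadratic_smoothing(std::vector<float> & logits, float h, float smoothing_factor) {
    for (float & l : logits) {
        const float d = l - h;
        l = -smoothing_factor * d * d + h; // the top logit (d == 0) is unchanged
    }
}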
@@ -907,7 +1284,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
sample_dry(n_ctx, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_sequence_breakers, &candidates_p);
//prefilter to top 5k tokens for improved speed
llama_sampler_top_k_impl(&candidates_p, 5000);
sample_top_k(&candidates_p, 5000);
if (mirostat == 1 || mirostat == 2)
{
@@ -931,20 +1308,20 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
switch (sampler_order[i])
{
case KCPP_SAMPLER_TOP_K:
llama_sampler_top_k_impl(&candidates_p, top_k);
sample_top_k(&candidates_p, top_k);
break;
case KCPP_SAMPLER_TOP_A:
sample_top_a(&candidates_p, top_a, 1);
break;
case KCPP_SAMPLER_TOP_P:
llama_sampler_top_p_impl(&candidates_p, top_p, 1);
llama_sampler_min_p_impl(&candidates_p, min_p, 1);
sample_top_p(&candidates_p, top_p, 1);
sample_min_p(&candidates_p, min_p, 1);
break;
case KCPP_SAMPLER_TFS:
llama_sampler_tail_free_impl(&candidates_p, tfs, 1);
sample_tail_free(&candidates_p, tfs, 1);
break;
case KCPP_SAMPLER_TYP:
llama_sampler_typical_impl(&candidates_p, typical_p, 1);
sampler_typical(&candidates_p, typical_p, 1);
break;
case KCPP_SAMPLER_TEMP:
if (dynatemp_range>0)
@@ -955,7 +1332,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
llama_sampler_entropy_impl(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
sample_entropy(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
}
else
{

View file

@@ -101,6 +101,10 @@ struct ring_buffer {
}
void push_back(const T & value) {
if (capacity == 0) {
throw std::runtime_error("ring buffer: capacity is zero");
}
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
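The capacity check added above matters because the very next statement computes (first + 1) % capacity, which divides by zero for a zero-capacity buffer; with the guard in place, a full buffer simply advances first and overwrites its oldest slot. A compact sketch of a ring buffer with the same overwrite-on-full push_back semantics (illustrative, not the struct in this diff):

#include <cstddef>
#include <stdexcept>
#include <vector>

// Sketch: fixed-capacity ring buffer that overwrites the oldest element when
// full, mirroring the push_back() semantics shown above.
template <typename T>
struct tiny_ring {
    explicit tiny_ring(std::size_t cap) : buf(cap) {}

    void push_back(const T & v) {
        if (buf.empty()) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }
        if (sz == buf.size()) {
            first = (first + 1) % buf.size(); // drop the oldest element
        } else {
            ++sz;
        }
        buf[(first + sz - 1) % buf.size()] = v; // write the newest slot
    }

    std::size_t size()  const { return sz; }
    const T &   front() const { return buf[first]; } // oldest element

    std::vector<T> buf;
    std::size_t first = 0, sz = 0;
};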

File diff suppressed because it is too large Load diff

View file

@@ -23,16 +23,6 @@ struct llama_sampler_chain {
mutable int32_t n_sample;
};
using llama_token_cnt = std::unordered_map<llama_token, int>;
// TODO: tmp exposed until test-sampling is fixed
void llama_sampler_penalties_impl(
llama_token_data_array * cur_p,
const llama_token_cnt & token_count,
float penalty_repeat,
float penalty_freq,
float penalty_present);
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,

View file

@@ -6449,6 +6449,11 @@ static void llm_load_vocab(
)
) {
vocab.special_eot_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
break;
}
}
@@ -6462,6 +6467,11 @@ static void llm_load_vocab(
const auto & t = vocab.token_to_id.find("<|eom_id|>");
if (t != vocab.token_to_id.end()) {
vocab.special_eom_id = t->second;
if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t->first.c_str());
vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
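The warning added above fires when a token that textually looks like a control token (such as an end-of-turn or end-of-message marker) was not flagged LLAMA_TOKEN_ATTR_CONTROL in the model metadata; the loader then forces the attribute so the tokenizer treats it as special. A standalone sketch of that test-and-override pattern (enum values and names here are made up for illustration):

#include <cstdint>
#include <cstdio>

// Sketch of the test-and-override pattern above (enum values are made up;
// the real code uses llama_token_attr / LLAMA_TOKEN_ATTR_CONTROL).
enum my_token_attr : std::uint32_t {
    MY_TOKEN_ATTR_NORMAL  = 1u << 0,
    MY_TOKEN_ATTR_CONTROL = 1u << 1,
};

static std::uint32_t ensure_control(std::uint32_t attr, const char * text) {
    if ((attr & MY_TOKEN_ATTR_CONTROL) == 0) {
        std::fprintf(stderr, "warning: '%s' looks like a control token but is not flagged as one\n", text);
        attr = MY_TOKEN_ATTR_CONTROL; // override, as the loader does for eot/eom above
    }
    return attr;
}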
@@ -16143,6 +16153,13 @@ static int llama_decode_internal(
return -1;
}
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
@@ -16435,6 +16452,13 @@ static int llama_encode_internal(
return -1;
}
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
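Both llama_decode_internal and llama_encode_internal now validate every token id in the batch against [0, n_vocab) before doing any work, turning what would otherwise be an out-of-bounds embedding lookup into a clean -1 return. A hedged sketch of the same pre-check done on the caller side (helper name is hypothetical):

#include <cstdint>
#include <vector>

// Sketch: reject out-of-range token ids before handing a batch to the model,
// mirroring the checks added to llama_decode/encode above.
static bool batch_tokens_valid(const std::vector<std::int32_t> & tokens, std::uint32_t n_vocab) {
    for (const std::int32_t t : tokens) {
        if (t < 0 || (std::uint32_t) t >= n_vocab) {
            return false; // would index past the embedding table
        }
    }
    return true;
}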

119
tests/test-arg-parser.cpp Normal file
View file

@@ -0,0 +1,119 @@
#include <string>
#include <vector>
#include <sstream>
#include <unordered_set>
#undef NDEBUG
#include <cassert>
#include "common.h"
int main(void) {
gpt_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
try {
auto options = gpt_params_parser_init(params, (enum llama_example)ex);
std::unordered_set<std::string> seen_args;
std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : options) {
// check for args duplications
for (const auto & arg : opt.args) {
if (seen_args.find(arg) == seen_args.end()) {
seen_args.insert(arg);
} else {
fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
exit(1);
}
}
// check for env var duplications
if (opt.env) {
if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
seen_env_vars.insert(opt.env);
} else {
fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
exit(1);
}
}
}
} catch (std::exception & e) {
printf("%s\n", e.what());
assert(false);
}
}
auto list_str_to_char = [](std::vector<std::string> & argv) -> std::vector<char *> {
std::vector<char *> res;
for (auto & arg : argv) {
res.push_back(const_cast<char *>(arg.data()));
}
return res;
};
std::vector<std::string> argv;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
printf("test-arg-parser: test invalid usage\n\n");
argv = {"binary_name", "-m"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
argv = {"binary_name", "-ngl", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
argv = {"binary_name", "-sm", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
printf("test-arg-parser: test valid usage\n\n");
argv = {"binary_name", "-m", "model_file.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(params.model == "model_file.gguf");
argv = {"binary_name", "-t", "1234"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(params.cpuparams.n_threads == 1234);
argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(params.verbosity == 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(params.model == "abc.gguf");
assert(params.n_predict == 6789);
assert(params.n_batch == 9090);
// skip this part on windows, because setenv is not supported
#ifdef _WIN32
printf("test-arg-parser: skip on windows build\n");
#else
printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n");
setenv("LLAMA_ARG_THREADS", "blah", true);
argv = {"binary_name"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(params.model == "blah.gguf");
assert(params.cpuparams.n_threads == 1010);
printf("test-arg-parser: test environment variables being overwritten\n\n");
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name", "-m", "overwritten.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(params.model == "overwritten.gguf");
assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32
printf("test-arg-parser: all tests OK\n\n");
}