Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	README.md
#	llama.cpp
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	tests/test-backend-ops.cpp
This commit is contained in:
Concedo 2024-05-17 22:30:41 +08:00
commit 47cbfd6150
50 changed files with 4746 additions and 923 deletions

45
CMakePresets.json Normal file
View file

@ -0,0 +1,45 @@
{
"version": 4,
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
]
}

View file

@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View file

@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

View file

@ -1061,6 +1061,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.rpc_servers = argv[i];
return true;
}
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
@ -1558,6 +1566,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
}
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@ -1831,6 +1840,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
@ -2544,7 +2554,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
size_t pos_start = 0;
size_t pos_found = 0;
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");

View file

@ -78,6 +78,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

View file

@ -26,7 +26,7 @@ namespace grammar_parser {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
return result.first->second;
}

View file

@ -272,7 +272,7 @@ private:
if (literal.empty()) {
return false;
}
ret.push_back(std::make_pair(literal, true));
ret.emplace_back(literal, true);
literal.clear();
return true;
};
@ -298,7 +298,7 @@ private:
while (i < length) {
char c = sub_pattern[i];
if (c == '.') {
seq.push_back(std::make_pair(get_dot(), false));
seq.emplace_back(get_dot(), false);
i++;
} else if (c == '(') {
i++;
@ -307,7 +307,7 @@ private:
_warnings.push_back("Unsupported pattern syntax");
}
}
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') {
i++;
if (start > 0 && sub_pattern[start - 1] != '(') {
@ -331,9 +331,9 @@ private:
}
square_brackets += ']';
i++;
seq.push_back(std::make_pair(square_brackets, false));
seq.emplace_back(square_brackets, false);
} else if (c == '|') {
seq.push_back(std::make_pair("|", false));
seq.emplace_back("|", false);
i++;
} else if (c == '*' || c == '+' || c == '?') {
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@ -417,7 +417,7 @@ private:
}
}
if (!literal.empty()) {
seq.push_back(std::make_pair(literal, true));
seq.emplace_back(literal, true);
}
}
}

View file

@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#ifndef _MSC_VER
#if !defined(_MSC_VER) || defined(__clang__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#ifndef _MSC_VER
#if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#ifndef _MSC_VER
#if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else

View file

@ -20,11 +20,13 @@
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
# TODO: automate the update of convert-hf-to-gguf.py
#
import logging
import os
import pathlib
import re
import requests
import sys
import json
@ -35,6 +37,7 @@ from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
sess = requests.Session()
class TOKENIZER_TYPE(IntEnum):
@ -79,63 +82,44 @@ models = [
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
]
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
os.makedirs("models/tokenizers")
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"}
response = requests.get(url, headers=headers)
if response.status_code == 200:
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
f.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
else:
logger.info(f"Failed to download file. Status code: {response.status_code}")
# download the tokenizer models
for model in models:
def download_model(model):
name = model["name"]
repo = model["repo"]
tokt = model["tokt"]
if not os.path.exists(f"models/tokenizers/{name}"):
os.makedirs(f"models/tokenizers/{name}")
else:
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
continue
logger.info(f"Downloading {name} to models/tokenizers/{name}")
url = f"{repo}/raw/main/config.json"
save_path = f"models/tokenizers/{name}/config.json"
download_file_with_auth(url, token, save_path)
url = f"{repo}/raw/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
# if downloaded file is less than 1KB, we likely need to download an LFS instead
if os.path.getsize(save_path) < 1024:
# remove the file
os.remove(save_path)
url = f"{repo}/resolve/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM:
url = f"{repo}/resolve/main/tokenizer.model"
save_path = f"models/tokenizers/{name}/tokenizer.model"
download_file_with_auth(url, token, save_path)
files.append("tokenizer.model")
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
url = f"{repo}/raw/main/tokenizer_config.json"
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
download_file_with_auth(url, token, save_path)
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function
src_ifs = ""
for model in models:
@ -224,11 +208,18 @@ src_func = f"""
return res
"""
print(src_func) # noqa: NP100
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
convert_py,
flags=re.DOTALL | re.MULTILINE,
)
logger.info("\n")
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n")
convert_py_pth.write_text(convert_py)
logger.info("+++ convert-hf-to-gguf.py was updated")
# generate tests for each tokenizer model

View file

@ -402,6 +402,7 @@ class Model:
# NOTE: this function is generated by convert-hf-to-gguf-update.py
# do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
# Marker: Start get_vocab_base_pre
def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
@ -489,6 +490,7 @@ class Model:
logger.debug(f"chkhsh: {chkhsh}")
return res
# Marker: End get_vocab_base_pre
def _set_vocab_gpt2(self) -> None:
tokens, toktypes, tokpre = self.get_vocab_base()
@ -526,7 +528,7 @@ class Model:
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
for i in range(vocab_size):
if i not in reverse_vocab:

View file

@ -1109,7 +1109,7 @@ class OutputFile:
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1]
name = params.path_model.name
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"

View file

@ -1,6 +1,6 @@
# Debugging Tests Tips
## How to run & debug a specific test without anything else to keep the feedback loop short?
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from
It will then build & run in the debugger for you.
To just execute a test and get back a PASS or FAIL message run:
```bash
./scripts/debug-test.sh test-tokenizer
```
To test in GDB use the `-g` flag to enable gdb test mode.
```bash
./scripts/debug-test.sh -g test-tokenizer
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```
To speed up the testing loop, if you know your test number you can just run it similar to below:
```bash
./scripts/debug-test.sh test 23
```
For further reference use `debug-test.sh -h` to print help.
&nbsp;
@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```
#### Step 3.1: Identify Test Command for Debugging
#### Step 3: Find all tests available that matches REGEX
The output of this command will give you the command & arguments needed to run GDB.
@ -69,11 +83,13 @@ Labels: main
...
```
So for test #1 we can tell these two pieces of relevant information:
#### Step 4: Identify Test Command for Debugging
So for test #1 above we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
#### Step 3.2: Run GDB on test command
#### Step 5: Run GDB on test command
Based on the ctest 'test command' report above we can then run a gdb session via this command below:

View file

@ -49,4 +49,7 @@ else()
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif()

View file

@ -212,6 +212,7 @@ int main(int argc, char ** argv) {
// clean up
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();

View file

@ -7,6 +7,8 @@ android {
namespace = "com.example.llama"
compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
@ -18,6 +20,17 @@ android {
vectorDrawables {
useSupportLibrary = true
}
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
}
}
}
buildTypes {
@ -42,6 +55,17 @@ android {
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
}
dependencies {
@ -54,7 +78,6 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")

View file

@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
@ -101,13 +101,13 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
@ -139,25 +139,25 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_bench_1model(
Java_com_example_llama_Llm_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
@ -271,13 +271,13 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
@ -313,19 +313,19 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
llama_backend_init();
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
Java_android_llama_cpp_LLamaAndroid_completion_1init(
Java_com_example_llama_Llm_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
@ -376,7 +376,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
Java_com_example_llama_Llm_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
@ -438,6 +438,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

View file

@ -1,4 +1,4 @@
package android.llama.cpp
package com.example.llama
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
class LLamaAndroid {
class Llm {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@ -165,8 +165,8 @@ class LLamaAndroid {
}
// Enforce only one instance of Llm.
private val _instance: LLamaAndroid = LLamaAndroid()
private val _instance: Llm = Llm()
fun instance(): LLamaAndroid = _instance
fun instance(): Llm = _instance
}
}

View file

@ -1,6 +1,5 @@
package com.example.llama
import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
@ -10,7 +9,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
@ -29,7 +28,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
viewModelScope.launch {
try {
llamaAndroid.unload()
llm.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
@ -45,7 +44,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
messages += ""
viewModelScope.launch {
llamaAndroid.send(text)
llm.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
@ -58,7 +57,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val warmupResult = llm.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
@ -71,7 +70,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
return@launch
}
messages += llamaAndroid.bench(512, 128, 1, 3)
messages += llm.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
@ -82,7 +81,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llamaAndroid.load(pathToModel)
llm.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)

View file

@ -2,5 +2,4 @@
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
id("com.android.library") version "8.2.0" apply false
}

View file

@ -1 +0,0 @@
/build

View file

@ -1,21 +0,0 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View file

@ -1,24 +0,0 @@
package android.llama.cpp
import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.Assert.*
/**
* Instrumented test, which will execute on an Android device.
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
@Test
fun useAppContext() {
// Context of the app under test.
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
assertEquals("android.llama.cpp.test", appContext.packageName)
}
}

View file

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View file

@ -1,49 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
# Also provides "common"
FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View file

@ -1,17 +0,0 @@
package android.llama.cpp
import org.junit.Test
import org.junit.Assert.*
/**
* Example local unit test, which will execute on the development machine (host).
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
class ExampleUnitTest {
@Test
fun addition_isCorrect() {
assertEquals(4, 2 + 2)
}
}

View file

@ -15,4 +15,3 @@ dependencyResolutionManagement {
rootProject.name = "LlamaAndroid"
include(":app")
include(":llama")

View file

@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct {
struct ggml_tensor * newline;
struct ggml_context * ctx;
} model;
@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params);
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) {
LOG_TEE("newline_tmp tensor buffer is NULL\n");
}
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else {
model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) {
LOG_TEE("newline_tmp tensor data is NULL\n");
}
}
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base

View file

@ -1,6 +1,8 @@
# quantize
TODO
You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
Note: It is synced from llama.cpp `main` every 6 hours.
## Llama 2 7B

View file

@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

74
examples/rpc/README.md Normal file
View file

@ -0,0 +1,74 @@
## Overview
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```
Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
## Usage
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:
```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```
Then, start the `rpc-server` with the backend:
```bash
$ bin/rpc-server -p 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

130
examples/rpc/rpc-server.cpp Normal file
View file

@ -0,0 +1,130 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include "ggml-rpc.h"
#ifdef _WIN32
# include <windows.h>
#else
# include <unistd.h>
#endif
#include <string>
#include <stdio.h>
struct rpc_server_params {
std::string host = "0.0.0.0";
int port = 50052;
size_t backend_mem = 0;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, "\n");
}
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-H" || arg == "--host") {
if (++i >= argc) {
return false;
}
params.host = argv[i];
} else if (arg == "-p" || arg == "--port") {
if (++i >= argc) {
return false;
}
params.port = std::stoi(argv[i]);
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-m" || arg == "--mem") {
if (++i >= argc) {
return false;
}
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#elif GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
backend = ggml_backend_metal_init();
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
fprintf(stderr, "%s: using CPU backend\n", __func__);
backend = ggml_backend_cpu_init();
}
return backend;
}
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
#ifdef _WIN32
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
*total_mem = status.ullTotalPhys;
*free_mem = status.ullAvailPhys;
#else
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
*total_mem = pages * page_size;
*free_mem = *total_mem;
#endif
#endif
}
int main(int argc, char * argv[]) {
rpc_server_params params;
if (!rpc_server_params_parse(argc, argv, params)) {
fprintf(stderr, "Invalid parameters\n");
return 1;
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");
return 1;
}
std::string endpoint = params.host + ":" + std::to_string(params.port);
size_t free_mem, total_mem;
if (params.backend_mem > 0) {
free_mem = params.backend_mem;
total_mem = params.backend_mem;
} else {
get_backend_memory(&free_mem, &total_mem);
}
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View file

@ -17,7 +17,8 @@ The project is under active development, and we are [looking for feedback and co
**Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

View file

@ -672,6 +672,13 @@ struct server_context {
model = nullptr;
}
// Clear any sampling context
for (server_slot & slot : slots) {
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
}
}
llama_batch_free(batch);
}
@ -2381,6 +2388,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@ -2433,6 +2441,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
sparams.port = std::stoi(argv[i]);
} else if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers = argv[i];
} else if (arg == "--host") {
if (++i >= argc) {
invalid_param = true;

View file

@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
tensor->buffer = buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
tensor->backend = tensor->view_src->backend;
ggml_backend_buffer_init_tensor(buffer, tensor);
}

View file

@ -2564,7 +2564,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
}
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (cuda_graph_update_required) {
if (use_cuda_graph && cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++;
} else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0;

View file

@ -1,35 +1,36 @@
#include "upscale.cuh"
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
// blockIdx.z: idx of ne02*ne03
// blockIdx.y: idx of ne01*scale_factor aka ne1
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
// ne00xne01: ne00 * ne01
int ne0 = ne00 * scale_factor;
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
static __global__ void upscale_f32(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int ne13,
const float sf0, const float sf1, const float sf2, const float sf3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= ne10 * ne11 * ne12 * ne13) {
return;
}
// operation
int i00 = nidx / scale_factor;
int i01 = blockIdx.y / scale_factor;
int offset_src =
i00 +
i01 * ne00 +
blockIdx.z * ne00xne01;
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src];
int i10 = index % ne10;
int i11 = (index / ne10) % ne11;
int i12 = (index / (ne10 * ne11)) % ne12;
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
int i00 = i10 / sf0;
int i01 = i11 / sf1;
int i02 = i12 / sf2;
int i03 = i13 / sf3;
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
const int scale_factor, cudaStream_t stream) {
int ne0 = (ne00 * scale_factor);
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
static void upscale_f32_cuda(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int ne13,
const float sf0, const float sf1, const float sf2, const float sf3,
cudaStream_t stream) {
int dst_size = ne10 * ne11 * ne12 * ne13;
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
}
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -40,9 +41,11 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
const int scale_factor = dst->op_params[0];
const float sf0 = (float)dst->ne[0]/src0->ne[0];
const float sf1 = (float)dst->ne[1]/src0->ne[1];
const float sf2 = (float)dst->ne[2]/src0->ne[2];
const float sf3 = (float)dst->ne[3]/src0->ne[3];
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
}

View file

@ -120,9 +120,16 @@ extern "C" {
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
// 16-bit float

View file

@ -1378,7 +1378,7 @@ static enum ggml_status ggml_metal_graph_compute(
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (ne00%4 == 0) {
while (nth < ne00/4 && nth < 256) {
while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
nth *= 2;
}
if (use_f16) {
@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
}
} else {
while (nth < ne00 && nth < 1024) {
while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
nth *= 2;
}
if (use_f16) {
@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int sf = dst->op_params[0];
const float sf0 = (float)ne0/src0->ne[0];
const float sf1 = (float)ne1/src0->ne[1];
const float sf2 = (float)ne2/src0->ne[2];
const float sf3 = (float)ne3/src0->ne[3];
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
@ -2513,12 +2519,13 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_FLASH_ATTN_EXT:
{
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne11 % 32 == 0);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
GGML_ASSERT(ggml_are_same_shape (src1, src2));
GGML_ASSERT(src3);
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
size_t offs_src3 = 0;
@ -2528,6 +2535,11 @@ static enum ggml_status ggml_metal_graph_compute(
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
@ -2590,34 +2602,35 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
if (id_src3) {
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
} else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
}
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:21];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:23];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:24];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:25];
[encoder setBytes:&scale length:sizeof( float) atIndex:26];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:27];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:28];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:29];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
if (!use_vec_kernel) {
// half8x8 kernel

View file

@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int32_t & sf,
constant float & sf0,
constant float & sf1,
constant float & sf2,
constant float & sf3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1/sf;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
const int64_t i03 = i3/sf3;
const int64_t i02 = i2/sf2;
const int64_t i01 = i1/sf1;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = src0_ptr[i0/sf];
const int64_t i00 = i0/sf0;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_ptr[0] = src0_ptr[0];
}
}
@ -2049,27 +2054,24 @@ typedef void (flash_attn_ext_f16_t)(
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
@ -2090,27 +2092,24 @@ kernel void kernel_flash_attn_ext_f16(
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
@ -2180,10 +2179,6 @@ kernel void kernel_flash_attn_ext_f16(
const short ne22 = ne12;
const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast
const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13;
@ -2247,11 +2242,16 @@ kernel void kernel_flash_attn_ext_f16(
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
}
if (mask != q) {
// mqk = mqk*scale + mask*slope
simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply(mm, mslope, mm);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
} else {
// mqk = mqk*scale
simdgroup_multiply(mqk, mscale, mqk);
}
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
}
@ -2425,27 +2425,24 @@ kernel void kernel_flash_attn_ext_vec_f16(
device const char * v,
device const char * mask,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb23,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
@ -2521,10 +2518,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short ne22 = ne12;
const short ne23 = ne13;
const uint nb21 = nb11;
const uint nb22 = nb12;
const uint nb23 = nb13;
// broadcast
const short rk2 = ne02/ne12;
const short rk3 = ne03/ne13;
@ -2589,8 +2582,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
// mqk = mqk*scale + mask*slope
if (tiisg == 0) {
float4 mm = (float4) mp4[ic/4 + cc];
mqk = mqk*scale + mm*slope;
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
ss4[cc] = mqk;
}

File diff suppressed because it is too large Load diff

1023
ggml-rpc.cpp Normal file

File diff suppressed because it is too large Load diff

24
ggml-rpc.h Normal file
View file

@ -0,0 +1,24 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}
#endif

View file

@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
#pragma message("TODO: generalize upscale operator")
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
GGML_ASSERT(false && "TODO: generalize upscale operator");
const int scale_factor = dst->op_params[0];
upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);

931
ggml.c

File diff suppressed because it is too large Load diff

16
ggml.h
View file

@ -572,7 +572,8 @@ extern "C" {
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
enum ggml_backend_type backend;
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
struct ggml_backend_buffer * buffer;
@ -774,6 +775,7 @@ extern "C" {
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
@ -1680,12 +1682,24 @@ extern "C" {
float p1);
// nearest interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
GGML_API struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor);
// nearest interpolate
// nearest interpolate to specified dimensions
// used in tortoise.cpp
GGML_API struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3);
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,

265
llama.cpp
View file

@ -9,6 +9,10 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
@ -1711,91 +1715,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
GGML_UNUSED(host_buffer);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(fallback_gpu);
}
return buft;
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count() {
#if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
}
static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
GGML_UNUSED(device);
#endif
}
//
// globals
//
@ -2240,6 +2159,8 @@ struct llama_model {
int main_gpu;
int n_gpu_layers;
std::vector<std::string> rpc_servers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
@ -2383,6 +2304,104 @@ struct llama_context {
#endif
};
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_RPC
std::string endpoint = model.rpc_servers[gpu];
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
#elif defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(model);
GGML_UNUSED(gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu);
}
return buft;
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_RPC)
return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
GGML_UNUSED(model);
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
size_t total;
size_t free;
std::string endpoint = model.rpc_servers[device];
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
return free;
#elif defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
#endif
GGML_UNUSED(model);
GGML_UNUSED(device);
}
//
// kv cache helpers
//
@ -4861,13 +4880,13 @@ static bool llm_load_tensors(
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
// calculate the split points
int device_count = llama_get_device_count();
int device_count = llama_get_device_count(model);
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
std::vector<float> splits(device_count);
if (all_zero) {
// default split, by free memory
for (int i = 0; i < device_count; ++i) {
splits[i] = llama_get_device_memory(i);
splits[i] = llama_get_device_memory(model, i);
}
} else {
std::copy(tensor_split, tensor_split + device_count, splits.begin());
@ -4887,35 +4906,35 @@ static bool llm_load_tensors(
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
}
// assign the output layer
if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
}
} else {
ggml_backend_buffer_type_t split_buft;
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
} else {
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(main_gpu);
split_buft = llama_default_buffer_type_offload(model, main_gpu);
}
// assign the repeating layers
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = {
split_buft,
llama_default_buffer_type_offload(main_gpu)
llama_default_buffer_type_offload(model, main_gpu)
};
}
// assign the output layer
if (n_gpu_layers > n_layer && !clblast_offload_fallback_mode) {
model.buft_output = {
split_buft,
llama_default_buffer_type_offload(main_gpu)
llama_default_buffer_type_offload(model, main_gpu)
};
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
@ -6673,6 +6692,7 @@ static struct ggml_tensor * llm_build_kqv(
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
cb(q, "q", il);
@ -6695,8 +6715,8 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il],
n_embd_head_v, n_kv, n_head_kv,
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
0);
cb(v, "v", il);
@ -6706,7 +6726,7 @@ static struct ggml_tensor * llm_build_kqv(
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
}
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
} else {
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
@ -6751,7 +6771,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il);
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
cb(cur, "kqv_merged_cont", il);
}
@ -13090,6 +13110,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
}
}
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
if (add_special && vocab.special_add_eos == 1) {
GGML_ASSERT(vocab.special_eos_id != -1);
output.push_back(vocab.special_eos_id);
@ -13126,6 +13153,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
}
}
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
if (add_special && vocab.special_add_eos == 1) {
GGML_ASSERT(vocab.special_add_eos != -1);
output.push_back(vocab.special_eos_id);
@ -14200,9 +14234,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
// Sample the next word X using top-k sampling
llama_sample_top_k(nullptr, candidates, int(k), 1);
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
llama_token X = llama_sample_token(ctx, candidates);
t_start_sample_us = ggml_time_us();
@ -14216,9 +14248,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
// Update mu using the learning rate and error
*mu = *mu - eta * e;
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
return X;
}
@ -15705,6 +15735,7 @@ struct llama_model_params llama_model_default_params() {
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides =*/ nullptr,
@ -15788,7 +15819,7 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
@ -15851,7 +15882,17 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
if (params.rpc_servers != nullptr) {
// split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
while ((pos = servers.find(",")) != std::string::npos) {
std::string server = servers.substr(0, pos);
model->rpc_servers.push_back(server);
servers.erase(0, pos + 1);
}
model->rpc_servers.push_back(servers);
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
@ -15998,7 +16039,17 @@ struct llama_context * llama_new_context_with_model(
if (!hparams.vocab_only) {
// initialize backends
#ifdef GGML_USE_METAL
#if defined(GGML_USE_RPC)
for (auto & server : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
@ -16155,7 +16206,7 @@ struct llama_context * llama_new_context_with_model(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
llama_get_device_count() > 1 &&
llama_get_device_count(*model) > 1 &&
model->n_gpu_layers > (int)model->hparams.n_layer &&
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv;
@ -17278,13 +17329,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
}
else {
if (cell_range_begin != kv_self.size) {
cell_ranges.push_back({ cell_range_begin, i });
cell_ranges.emplace_back(cell_range_begin, i);
cell_range_begin = kv_self.size;
}
}
}
if (cell_range_begin != kv_self.size) {
cell_ranges.push_back({ cell_range_begin, kv_self.size });
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
}
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count

View file

@ -242,6 +242,9 @@ extern "C" {
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
// comma separated list of RPC servers to use for offloading
const char * rpc_servers;
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.

View file

@ -1,45 +1,142 @@
#!/bin/bash
test_suite=${1:-}
test_number=${2:-}
PROG=${0##*/}
build_dir="build-ci-debug"
# Print Color Commands
red=$(tput setaf 1)
green=$(tput setaf 2)
yellow=$(tput setaf 3)
blue=$(tput setaf 4)
magenta=$(tput setaf 5)
cyan=$(tput setaf 6)
normal=$(tput sgr0)
# Print Help Message
####################
print_full_help() {
cat << EOF
Usage: $PROG [OPTION]... <test_regex> (test_number)
Debug specific ctest program.
Options:
-h, --help display this help and exit
-g run in gdb mode
Arguments:
<test_regex> (Mandatory) Supply one regex to the script to filter tests
(test_number) (Optional) Test number to run a specific test
Example:
$PROG test-tokenizer
$PROG test-tokenizer 3
EOF
}
abort() {
echo "Error: $1" >&2
cat << EOF >&2
Usage: $PROG [OPTION]... <test_regex> (test_number)
Debug specific ctest program.
Refer to --help for full instructions.
EOF
exit 1
}
# Dependency Sanity Check
#########################
check_dependency() {
command -v "$1" >/dev/null 2>&1 || {
abort "$1 is required but not found. Please install it and try again."
}
}
check_dependency ctest
check_dependency cmake
# Step 0: Check the args
########################
if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
echo "Debug specific ctest program."
echo
echo "Options:"
echo " -h, --help Display this help and exit"
echo
echo "Arguments:"
echo " <test_regex> (Mandatory) Supply one regex to the script to filter tests"
echo " (test_number) (Optional) Test number to run a specific test"
echo
echo "Example:"
echo " $PROG test-tokenizer"
echo " $PROG test-tokenizer 3"
echo
print_full_help >&2
exit 0
fi
# Function to select and debug a test
function select_test() {
test_suite=${1:-test}
test_number=${2:-}
# Parse command-line options
gdb_mode=false
while getopts "g" opt; do
case $opt in
g)
gdb_mode=true
echo "gdb_mode Mode Enabled"
;;
esac
done
# Sanity Check If Tests Is Detected
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]
then
echo "No tests avaliable... check your compliation process..."
echo "Exiting."
exit 1
# Shift the option parameters
shift $((OPTIND - 1))
# Positionial Argument Processing : <test_regex>
if [ -z "${1}" ]; then
abort "Test regex is required"
else
test_suite=${1:-}
fi
if [ -z $test_number ]
then
# Positionial Argument Processing : (test_number)
test_number=${2:-}
# Step 1: Reset and Setup folder context
########################################
## Sanity check that we are actually in a git repo
repo_root=$(git rev-parse --show-toplevel)
if [ ! -d "$repo_root" ]; then
abort "Not in a Git repository."
fi
## Reset folder to root context of git repo and Create and enter build directory
pushd "$repo_root"
rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
# Step 2: Setup Build Environment and Compile Test Binaries
###########################################################
# Note: test-eval-callback requires -DLLAMA_CURL
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
pushd "$build_dir"
make -j || abort "Failed to compile"
popd > /dev/null || exit 1
# Step 3: Find all tests available that matches REGEX
####################################################
# Ctest Gather Tests
# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
# `-V` : Verbose Mode
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
pushd "$build_dir"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]; then
abort "No tests avaliable... check your compliation process..."
fi
popd > /dev/null || exit 1
# Step 4: Identify Test Command for Debugging
#############################################
# Select test number
if [ -z $test_number ]; then
# List out avaliable tests
printf "Which test would you like to debug?\n"
id=0
@ -53,65 +150,54 @@ function select_test() {
# Prompt user which test they wanted to run
printf "\nRun test#? "
read test_number
else
printf "\nUser Already Requested #${test_number}"
printf "\nUser Already Requested #${test_number}\n"
fi
# Start GDB with the requested test binary and arguments
printf "Debugging(GDB) test: ${tests[test_number]}\n"
# Change IFS (Internal Field Separator)
sIFS=$IFS
IFS=$'\n'
# Grab all tests commands
pushd "$build_dir"
sIFS=$IFS # Save Initial IFS (Internal Field Separator)
IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
IFS=$sIFS # Reset IFS (Internal Field Separator)
popd > /dev/null || exit 1
# Get test args
gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
IFS=$sIFS
printf "Debug arguments: ${gdb_args[test_number]}\n\n"
# Grab specific test command
single_test_name="${tests[test_number]}"
single_test_command="${test_args[test_number]}"
# Expand paths if needed
args=()
for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
do
args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
done
# Step 5: Execute or GDB Debug
##############################
printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
printf "${cyan}single_test_command: ${single_test_command}${normal}\n"
if [ "$gdb_mode" = "true" ]; then
# Execute debugger
echo "gdb args: ${args[@]}"
gdb --args ${args[@]}
}
# Step 0: Check the args
if [ -z "$test_suite" ]
then
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
echo "Supply one regex to the script to filter tests,"
echo "and optionally a test number to run a specific test."
echo "Use --help flag for full instructions"
exit 1
fi
# Step 1: Reset and Setup folder context
## Sanity check that we are actually in a git repo
repo_root=$(git rev-parse --show-toplevel)
if [ ! -d "$repo_root" ]; then
echo "Error: Not in a Git repository."
exit 1
fi
## Reset folder to root context of git repo
pushd "$repo_root" || exit 1
eval "gdb --args ${single_test_command}"
popd > /dev/null || exit 1
## Create and enter build directory
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1
else
# Execute Test
pushd "$repo_root" || exit 1
eval "${single_test_command}"
exit_code=$?
popd > /dev/null || exit 1
# Step 2: Setup Build Environment and Compile Test Binaries
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
pushd "$build_dir" && make -j || exit 1
# Print Result
printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
printf "${yellow}Command: ${single_test_command}${normal}\n"
if [ $exit_code -eq 0 ]; then
printf "${green}TEST PASS${normal}\n"
else
printf "${red}TEST FAIL${normal}\n"
fi
# Step 3: Debug the Test
select_test "$test_suite" "$test_number"
fi
# Step 4: Return to the directory from which the user ran the command.
popd || exit 1
popd || exit 1
popd || exit 1
# Return to the directory from which the user ran the command.
popd > /dev/null || exit 1