Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	.github/labeler.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	Package.swift
#	README.md
#	ci/run.sh
#	docs/build.md
#	examples/CMakeLists.txt
#	flake.lock
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	grammars/README.md
#	requirements/requirements-convert_hf_to_gguf.txt
#	requirements/requirements-convert_hf_to_gguf_update.txt
#	scripts/check-requirements.sh
#	scripts/compare-llama-bench.py
#	scripts/gen-unicode-data.py
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tests/test-tokenizer-random.py
commit 2cad736260
Concedo, 2024-07-11 16:36:16 +08:00
85 changed files with 12568 additions and 445 deletions

.github/workflows/python-type-check.yml (new file)
@@ -0,0 +1,38 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.370
          level: warning
          warnings: true
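The workflow above only runs in CI. As a hedged local approximation (assuming the `pyright` wrapper published on PyPI and the type-check configuration already present in the repo; the workflow itself uses `jakebailey/pyright-action@v2` instead), roughly the same check could be reproduced with:

```bash
# install project requirements plus a pinned pyright, then type-check the tree
pip install -r requirements/requirements-all.txt
pip install pyright==1.1.370
pyright --level warning
```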


@@ -430,8 +430,10 @@ add_library(ggml
            ggml/include/ggml-backend.h
            ggml/src/ggml-quants.c
            ggml/src/ggml-quants.h
-           ggml/src/sgemm.cpp
-           ggml/src/sgemm.h
+           ggml/src/llamafile/sgemm.cpp
+           ggml/src/llamafile/sgemm.h
+           ggml/src/ggml-aarch64.c
+           ggml/src/ggml-aarch64.h
            ${GGML_SOURCES_CUDA})
 target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump


@@ -65,9 +65,9 @@ endif
 CUBLASLD_FLAGS =
 CUBLAS_OBJS =
-OBJS_FULL += ggml-alloc.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o grammar-parser.o
-OBJS_SIMPLE += ggml-alloc.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o grammar-parser.o
-OBJS_FAILSAFE += ggml-alloc.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o grammar-parser.o
+OBJS_FULL += ggml-alloc.o ggml-aarch64.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o grammar-parser.o
+OBJS_SIMPLE += ggml-alloc.o ggml-aarch64.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o grammar-parser.o
+OBJS_FAILSAFE += ggml-alloc.o ggml-aarch64.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o grammar-parser.o
 #lets try enabling everything
 CFLAGS += -pthread -s -Wno-deprecated -Wno-deprecated-declarations

@@ -421,11 +421,11 @@ ggml-quants_failsafe.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml
 	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
 #sgemm
-sgemm.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h
+sgemm.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
-sgemm_noavx2.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h
+sgemm_noavx2.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
-sgemm_failsafe.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h
+sgemm_failsafe.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
 #there's no intrinsics or special gpu ops used here, so we can have a universal object

@@ -437,6 +437,8 @@ unicode.o: src/unicode.cpp src/unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 unicode-data.o: src/unicode-data.cpp src/unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+ggml-aarch64.o: ggml/src/ggml-aarch64.c ggml/include/ggml.h ggml/src/ggml-aarch64.h ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
 #these have special gpu defines
 ggml-backend_default.o: ggml/src/ggml-backend.c ggml/include/ggml.h ggml/include/ggml-backend.h


@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
 #include "common.h"
 #include "build-info.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:

@@ -191,6 +195,12 @@ int32_t cpu_get_num_math() {
 // CLI argument parsing
 //
+void gpt_params_handle_hf_token(gpt_params & params) {
+    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
+        params.hf_token = std::getenv("HF_TOKEN");
+    }
+}
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model

@@ -238,6 +248,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     gpt_params_handle_model_default(params);
+    gpt_params_handle_hf_token(params);
     if (params.escape) {
         string_process_escapes(params.prompt);
         string_process_escapes(params.input_prefix);

@@ -653,6 +665,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.model_url = argv[i];
         return true;
     }
+    if (arg == "-hft" || arg == "--hf-token") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.hf_token = argv[i];
+        return true;
+    }
     if (arg == "-hfr" || arg == "--hf-repo") {
         CHECK_ARG
         params.hf_repo = argv[i];

@@ -1577,6 +1597,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
     options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
     options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+    options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
     options.push_back({ "retrieval" });
     options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });

@@ -2016,9 +2037,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     llama_model * model = nullptr;
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }

@@ -2206,7 +2227,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
     return str.rfind(prefix, 0) == 0;
 }
-static bool llama_download_file(const std::string & url, const std::string & path) {
+static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);

@@ -2221,6 +2242,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    // Check if hf-token or bearer-token was specified
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer ";
+        auth_header += hf_token.c_str();
+        struct curl_slist *http_headers = NULL;
+        http_headers = curl_slist_append(http_headers, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+    }
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     // operating system. Currently implemented under MS-Windows.

@@ -2416,6 +2446,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 struct llama_model * llama_load_model_from_url(
         const char * model_url,
         const char * path_model,
+        const char * hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
     if (!model_url || strlen(model_url) == 0) {

@@ -2423,7 +2454,7 @@ struct llama_model * llama_load_model_from_url(
         return NULL;
     }
-    if (!llama_download_file(model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model, hf_token)) {
         return NULL;
     }

@@ -2471,14 +2502,14 @@ struct llama_model * llama_load_model_from_url(
         // Prepare download in parallel
         std::vector<std::future<bool>> futures_download;
         for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
                 char split_path[PATH_MAX] = {0};
                 llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
                 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-                return llama_download_file(split_url, split_path);
+                return llama_download_file(split_url, split_path, hf_token);
             }, idx));
         }

@@ -2497,6 +2528,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * repo,
         const char * model,
         const char * path_model,
+        const char * hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //

@@ -2512,7 +2544,7 @@ struct llama_model * llama_load_model_from_hf(
     model_url += "/resolve/main/";
     model_url += model;
-    return llama_load_model_from_url(model_url.c_str(), path_model, params);
+    return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 #else

@@ -2520,6 +2552,7 @@ struct llama_model * llama_load_model_from_hf(
 struct llama_model * llama_load_model_from_url(
         const char * /*model_url*/,
         const char * /*path_model*/,
+        const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;

@@ -2529,6 +2562,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * /*repo*/,
         const char * /*model*/,
         const char * /*path_model*/,
+        const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
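Taken together, the hunks above thread an optional Hugging Face access token through the download path: `-hft`/`--hf-token` (or the `HF_TOKEN` environment variable picked up by `gpt_params_handle_hf_token()`) ends up as an `Authorization: Bearer` header in `llama_download_file()`. A hedged usage sketch, with hypothetical repository and file names:

```bash
# pass the token explicitly on the command line ...
./llama-cli -hfr some-org/some-gated-model -hff some-gated-model-q4_k_m.gguf -hft "$MY_HF_TOKEN" -p "Hello"

# ... or rely on the HF_TOKEN environment variable instead
export HF_TOKEN="$MY_HF_TOKEN"
./llama-cli -hfr some-org/some-gated-model -hff some-gated-model-q4_k_m.gguf -p "Hello"
```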


@@ -125,6 +125,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
+    std::string hf_token = ""; // HF token
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";

@@ -273,6 +274,7 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };
+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);

@@ -328,8 +330,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 // Batch utils


@@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
         GGML_ASSERT(!original_logits.empty());
     }
     llama_token id = 0;
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
     if (temp < 0.0) {
         // greedy sampling, with probs

@@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
     }
     if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
         // Create an array with a single token data element for the sampled id
         llama_token_data single_token_data = {id, logits[id], 0.0f};
         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

@@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (ctx_sampling->grammar != NULL && !apply_grammar) {
         GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
     }
     // apply params.logit_bias map

@@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
-    cur.clear();
+    cur.resize(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };


@@ -265,7 +265,7 @@ class Model:
                     break
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray = data  # type hint
+                data: np.ndarray  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None

@@ -487,6 +487,9 @@
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"

@@ -596,10 +599,6 @@ class Model:
         tokenizer_path = self.dir_model / 'tokenizer.model'
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

@@ -2117,7 +2116,7 @@ class InternLM2Model(Model):
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)
-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

@@ -2145,6 +2144,9 @@ class InternLM2Model(Model):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of ununsed raw token
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN
             tokens.append(text)
             scores.append(score)

@@ -2160,6 +2162,47 @@
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)

@@ -2169,28 +2212,16 @@ class InternLM2Model(Model):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
         special_vocab.add_to_gguf(self.gguf_writer)
-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
     def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
         if n_head_kv is not None and n_head != n_head_kv:
             n_head = n_head_kv

@@ -2209,6 +2240,10 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]

@@ -2969,7 +3004,7 @@ class T5Model(Model):
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")
-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         # some models like Pile-T5 family use BPE tokenizer instead of Unigram

@@ -3149,7 +3184,7 @@ class JaisModel(Model):
             # but Jais's PyTorch model simply precalculates the slope values and places them
            # in relative_pes.slopes
             n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
-            first_val = float(data_torch._data[0])
+            first_val = float(data_torch[0].item())
             self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
             return tensors
@@ -3176,6 +3211,190 @@
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
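A hedged example of exercising the new ChatGLM support added above (assuming a local checkout of the THUDM/glm-4-9b-chat weights and that the convert script's usual `--outfile`/`--outtype` options apply):

```bash
# convert a downloaded glm-4-9b-chat checkpoint to a GGUF file at f16 precision
python convert_hf_to_gguf.py /path/to/glm-4-9b-chat --outfile glm-4-9b-chat-f16.gguf --outtype f16
```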


@@ -354,7 +354,8 @@ class GGMLToGGUF:
 def handle_metadata(cfg, hp):
-    import convert
+    import examples.convert_legacy_llama as convert
     assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
     hf_config_path = cfg.model_metadata_dir / "config.json"
     orig_config_path = cfg.model_metadata_dir / "params.json"


@@ -353,7 +353,7 @@ class Metadata:
     version: Optional[str] = None
     url: Optional[str] = None
     description: Optional[str] = None
-    licence: Optional[str] = None
+    license: Optional[str] = None
     source_url: Optional[str] = None
     source_hf_repo: Optional[str] = None

@@ -492,12 +492,13 @@ class LazyTensor:
 LazyModel: TypeAlias = 'dict[str, LazyTensor]'
+ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
 @dataclass
 class ModelPlus:
     model: LazyModel
     paths: list[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors', 'none']
+    format: ModelFormat
     vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.

@@ -536,7 +537,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
 def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
+    formats: set[ModelFormat] = set(mp.format for mp in models_plus)
     assert len(formats) == 1, "different formats?"
     format = formats.pop()
     paths = [path for mp in models_plus for path in mp.paths]

@@ -555,7 +556,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
     else:
         model = merge_sharded([mp.model for mp in models_plus])
-    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+    return ModelPlus(model, paths, format, vocab)
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:

@@ -805,7 +806,7 @@ class OutputFile:
     def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
-    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
         # Metadata About The Model And Its Provenence
         name = "LLaMA"
         if metadata is not None and metadata.name is not None:

@@ -827,8 +828,8 @@ class OutputFile:
             self.gguf.add_url(metadata.url)
         if metadata.description is not None:
             self.gguf.add_description(metadata.description)
-        if metadata.licence is not None:
-            self.gguf.add_licence(metadata.licence)
+        if metadata.license is not None:
+            self.gguf.add_licence(metadata.license)
         if metadata.source_url is not None:
             self.gguf.add_source_url(metadata.source_url)
         if metadata.source_hf_repo is not None:

@@ -943,7 +944,7 @@ class OutputFile:
     @staticmethod
     def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -977,7 +978,7 @@ class OutputFile:
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
         concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
         pad_vocab: bool = False,
-        metadata: Metadata = None,
+        metadata: Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -1396,6 +1397,8 @@ def main(args_in: list[str] | None = None) -> None:
     if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
+    assert params is not None
     logger.info(f"Vocab info: {vocab}")
     logger.info(f"Special vocab info: {special_vocab}")
     model = model_plus.model


@@ -0,0 +1,51 @@
# Migration notice for binary filenames
> [!IMPORTANT]
> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

This migration was important, but it is a breaking change that may not always be immediately obvious to users.
Please update all scripts and workflows to use the new binary names; a sketch of a bulk rename is shown after the table below.
| Old Filename | New Filename |
| ---- | ---- |
| main | llama-cli |
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| finetune | llama-finetune |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
| libllava.a | libllava.a |
| baby-llama | llama-baby-llama |
| batched | llama-batched |
| batched-bench | llama-batched-bench |
| benchmark-matmult | llama-benchmark-matmult |
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
| eval-callback | llama-eval-callback |
| gbnf-validator | llama-gbnf-validator |
| gguf | llama-gguf |
| gguf-split | llama-gguf-split |
| gritlm | llama-gritlm |
| imatrix | llama-imatrix |
| infill | llama-infill |
| llava-cli | llama-llava-cli |
| lookahead | llama-lookahead |
| lookup | llama-lookup |
| lookup-create | llama-lookup-create |
| lookup-merge | llama-lookup-merge |
| lookup-stats | llama-lookup-stats |
| parallel | llama-parallel |
| passkey | llama-passkey |
| perplexity | llama-perplexity |
| q8dot | llama-q8dot |
| quantize-stats | llama-quantize-stats |
| retrieval | llama-retrieval |
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| train-text-from-scratch | llama-train-text-from-scratch |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |
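As mentioned above, a minimal sketch of bulk-updating an existing shell script to the new names (assuming GNU sed and that the script invokes the binaries by their bare old names):

```bash
# rewrite a few old binary names to their llama- prefixed replacements, in place
sed -i -e 's/\bmain\b/llama-cli/g' \
       -e 's/\bserver\b/llama-server/g' \
       -e 's/\bquantize\b/llama-quantize/g' my-llama-script.sh
```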


@@ -0,0 +1,35 @@
// Warns users that this filename was deprecated, and provides a link for more information.

#include <cstdio>
#include <string>
#include <unordered_map>

// Main
int main(int argc, char** argv) {
    std::string filename = "main";
    if (argc >= 1) {
        filename = argv[0];
    }

    // Get only the program name from the full path
    auto pos = filename.find_last_of('/');
    if (pos != std::string::npos) {
        filename = filename.substr(pos+1);
    }

    // Append "llama-" to the beginning of filename to get the replacement filename
    auto replacement_filename = "llama-" + filename;

    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
    if (filename == "main") {
        replacement_filename = "llama-cli";
    }

    fprintf(stdout, "\n");
    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
    fprintf(stdout, "\n");

    return EXIT_FAILURE;
}


@@ -87,4 +87,4 @@ The LORA rank can be configured for each model tensor type separately with these
 The LORA rank of 'norm' tensors should always be 1.
-To see all available options use `finetune --help`.
+To see all available options use `llama-finetune --help`.


@@ -74,7 +74,7 @@ class Tensor:
             if len(self.ne) == 0:
                 self.nbytes = 0
             else:
-                self.nbytes = int(np.product(self.ne)) * 4
+                self.nbytes = int(np.prod(self.ne)) * 4
         else:
             raise ValueError(f"Unhandled data type '{self.dtype}'")


@@ -8,7 +8,7 @@ if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
 if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
 # MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
+MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
 while getopts "dg" opt; do
   case $opt in


@@ -0,0 +1,15 @@
set(TARGET llama-gguf-hash)
add_executable(${TARGET} gguf-hash.cpp)
install(TARGETS ${TARGET} RUNTIME)
# clibs dependencies
include_directories(deps/)
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash)
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1)
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,206 @@
# llama-gguf-hash
CLI to hash GGUF files to detect differences on a per-model and per-tensor level.
**Command line options:**
- `--help`: display help message
- `--xxh64`: use xxhash 64-bit hash mode (default)
- `--sha1`: use sha1
- `--uuid`: generate a UUIDv5 ID (an ID, not a hash)
- `--sha256`: use sha256
- `--all`: use all hash types
- `--no-layer`: exclude the per-layer hashes
- `-c`, `--check <manifest>`: verify against a manifest
## About
While most POSIX systems already have hash checking programs like sha256sum, they
are designed to check entire files. This is not ideal for our purposes if we want
to check the tensor data for consistency even when the metadata content of the
gguf KV store has been updated.

This program is designed to hash a gguf tensor payload on a 'per tensor layer'
basis in addition to an 'entire tensor model' hash. The intent is that the
whole-model hash can be checked first and, if any inconsistency is detected,
the per tensor hashes can then be used to narrow down the specific tensor layer
that is inconsistent.
For Maintainers:
- Detection of tensor inconsistency during development and automated tests
- This is served by xxh64 which is fast
- This is also served by having per tensor layer to assist in narrowing down
the location of the faulty tensor layer
- This is also served by sha1 which is much slower but more widely supported
For Model Creators:
- Optional consistent UUID generation based on model tensor content
- This is served by UUIDv5 which is useful for databases keys
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
For Model Users:
- Assurance of tensor layer integrity even if metadata was updated
- This is served by sha256 which is still considered very secure as of 2024
### Design Note
- The default behavior of this program, if no arguments are provided, is to hash
  using xxhash's xxh64 mode, because it is very fast and is primarily targeted
  towards maintainers who may want to use this in automated tests.
- xxhash also supports xxh32 and xxh128 for 32-bit and 128-bit hashes respectively;
  however, we picked the 64-bit xxhash as most computers are 64-bit as of 2024 and thus
  have a better affinity for calculating a hash that is 64 bits in size.
## Compile Example
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
make -C build clean
make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxh64 test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
./build/bin/llama-gguf-hash --sha256 test.gguf
```
## Generation and Verification Example
To generate a manifest we may use this command:
```bash
./llama-gguf-hash --all test.gguf > test.gguf.manifest
```
This would generate a manifest like the one below, which contains multiple hash types and per-tensor-layer hashes as well
(this excludes UUID, as that is an ID rather than a hash):
```bash
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
xxh64 a0af5d700049693b test.gguf:tensor_2
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
xxh64 1257733306b7992d test.gguf:tensor_4
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
xxh64 d238d16ba4711e58 test.gguf:tensor_5
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
xxh64 c22021c29854f093 test.gguf:tensor_7
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
xxh64 936df61f5d64261f test.gguf:tensor_8
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
xxh64 93fd20c64421c081 test.gguf:tensor_9
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
xxh64 5a54d3aad816f302 test.gguf
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
```
We can then use the normal check command, which by default verifies against the highest-security-strength hash present in the manifest:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or we may explicitly ask for a faster hash like:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or maybe we just want to check that all of the hashes are valid:
```bash
$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
## Crypto/Hash Libraries Used
These micro C library dependencies were installed via the [clib C package manager](https://github.com/clibs):
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c


@@ -0,0 +1,13 @@
{
"name": "rotate-bits",
"version": "0.1.1",
"repo": "jb55/rotate-bits.h",
"description": "rotate bits",
"keywords": ["rotl", "rotr"],
"src": ["rotate-bits.h"],
"license": "Public Domain",
"development": {
"thlorenz/tap.c": "*"
}
}

View file

@ -0,0 +1,46 @@
#ifndef __ROTATE_DEFS_H
#define __ROTATE_DEFS_H
#ifdef _MSC_VER
#include <stdlib.h>
#define ROTL32(v, n) _rotl((v), (n))
#define ROTL64(v, n) _rotl64((v), (n))
#define ROTR32(v, n) _rotr((v), (n))
#define ROTR64(v, n) _rotr64((v), (n))
#else
#include <stdint.h>
#define U8V(v) ((uint8_t)(v) & 0xFFU)
#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
#define ROTL32(v, n) \
(U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
// tests fail if we don't have this cast...
#define ROTL64(v, n) \
(U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
#endif
#define ROTL8(v, n) \
(U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#endif

View file

@ -0,0 +1,9 @@
{
"name": "sha1",
"version": "0.0.1",
"repo": "clibs/sha1",
"description": "sha1 hash algorithm",
"keywords": ["sha1", "hash"],
"license": "public domain",
"src": ["sha1.c", "sha1.h"]
}

View file

@ -0,0 +1,295 @@
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
#include <stdio.h>
#include <string.h>
/* for uint32_t */
#include <stdint.h>
#include "sha1.h"
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
|(rol(block->l[i],8)&0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
^block->l[(i+2)&15]^block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
/* Hash a single 512-bit block. This is the core of the algorithm. */
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
)
{
uint32_t a, b, c, d, e;
typedef union
{
unsigned char c[64];
uint32_t l[16];
} CHAR64LONG16;
#ifdef SHA1HANDSOFF
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
memcpy(block, buffer, 64);
#else
/* The following had better never be used because it causes the
* pointer-to-const buffer to be cast into a pointer to non-const.
* And the result is written through. I threw a "const" in, hoping
* this will cause a diagnostic.
*/
CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
#endif
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a, b, c, d, e, 0);
R0(e, a, b, c, d, 1);
R0(d, e, a, b, c, 2);
R0(c, d, e, a, b, 3);
R0(b, c, d, e, a, 4);
R0(a, b, c, d, e, 5);
R0(e, a, b, c, d, 6);
R0(d, e, a, b, c, 7);
R0(c, d, e, a, b, 8);
R0(b, c, d, e, a, 9);
R0(a, b, c, d, e, 10);
R0(e, a, b, c, d, 11);
R0(d, e, a, b, c, 12);
R0(c, d, e, a, b, 13);
R0(b, c, d, e, a, 14);
R0(a, b, c, d, e, 15);
R1(e, a, b, c, d, 16);
R1(d, e, a, b, c, 17);
R1(c, d, e, a, b, 18);
R1(b, c, d, e, a, 19);
R2(a, b, c, d, e, 20);
R2(e, a, b, c, d, 21);
R2(d, e, a, b, c, 22);
R2(c, d, e, a, b, 23);
R2(b, c, d, e, a, 24);
R2(a, b, c, d, e, 25);
R2(e, a, b, c, d, 26);
R2(d, e, a, b, c, 27);
R2(c, d, e, a, b, 28);
R2(b, c, d, e, a, 29);
R2(a, b, c, d, e, 30);
R2(e, a, b, c, d, 31);
R2(d, e, a, b, c, 32);
R2(c, d, e, a, b, 33);
R2(b, c, d, e, a, 34);
R2(a, b, c, d, e, 35);
R2(e, a, b, c, d, 36);
R2(d, e, a, b, c, 37);
R2(c, d, e, a, b, 38);
R2(b, c, d, e, a, 39);
R3(a, b, c, d, e, 40);
R3(e, a, b, c, d, 41);
R3(d, e, a, b, c, 42);
R3(c, d, e, a, b, 43);
R3(b, c, d, e, a, 44);
R3(a, b, c, d, e, 45);
R3(e, a, b, c, d, 46);
R3(d, e, a, b, c, 47);
R3(c, d, e, a, b, 48);
R3(b, c, d, e, a, 49);
R3(a, b, c, d, e, 50);
R3(e, a, b, c, d, 51);
R3(d, e, a, b, c, 52);
R3(c, d, e, a, b, 53);
R3(b, c, d, e, a, 54);
R3(a, b, c, d, e, 55);
R3(e, a, b, c, d, 56);
R3(d, e, a, b, c, 57);
R3(c, d, e, a, b, 58);
R3(b, c, d, e, a, 59);
R4(a, b, c, d, e, 60);
R4(e, a, b, c, d, 61);
R4(d, e, a, b, c, 62);
R4(c, d, e, a, b, 63);
R4(b, c, d, e, a, 64);
R4(a, b, c, d, e, 65);
R4(e, a, b, c, d, 66);
R4(d, e, a, b, c, 67);
R4(c, d, e, a, b, 68);
R4(b, c, d, e, a, 69);
R4(a, b, c, d, e, 70);
R4(e, a, b, c, d, 71);
R4(d, e, a, b, c, 72);
R4(c, d, e, a, b, 73);
R4(b, c, d, e, a, 74);
R4(a, b, c, d, e, 75);
R4(e, a, b, c, d, 76);
R4(d, e, a, b, c, 77);
R4(c, d, e, a, b, 78);
R4(b, c, d, e, a, 79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
/* Wipe variables */
a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
memset(block, '\0', sizeof(block));
#endif
}
/* SHA1Init - Initialize new context */
void SHA1Init(
SHA1_CTX * context
)
{
/* SHA1 initialization constants */
context->state[0] = 0x67452301;
context->state[1] = 0xEFCDAB89;
context->state[2] = 0x98BADCFE;
context->state[3] = 0x10325476;
context->state[4] = 0xC3D2E1F0;
context->count[0] = context->count[1] = 0;
}
/* Run your data through this. */
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
)
{
uint32_t i;
uint32_t j;
j = context->count[0];
if ((context->count[0] += len << 3) < j)
context->count[1]++;
context->count[1] += (len >> 29);
j = (j >> 3) & 63;
if ((j + len) > 63)
{
memcpy(&context->buffer[j], data, (i = 64 - j));
SHA1Transform(context->state, context->buffer);
for (; i + 63 < len; i += 64)
{
SHA1Transform(context->state, &data[i]);
}
j = 0;
}
else
i = 0;
memcpy(&context->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
)
{
unsigned i;
unsigned char finalcount[8];
unsigned char c;
#if 0 /* untested "improvement" by DHR */
/* Convert context->count to a sequence of bytes
* in finalcount. Second element first, but
* big-endian order within element.
* But we do it all backwards.
*/
unsigned char *fcp = &finalcount[8];
for (i = 0; i < 2; i++)
{
uint32_t t = context->count[i];
int j;
for (j = 0; j < 4; t >>= 8, j++)
*--fcp = (unsigned char) t;
}
#else
for (i = 0; i < 8; i++)
{
finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
}
#endif
c = 0200;
SHA1Update(context, &c, 1);
while ((context->count[0] & 504) != 448)
{
c = 0000;
SHA1Update(context, &c, 1);
}
SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
for (i = 0; i < 20; i++)
{
digest[i] = (unsigned char)
((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
}
/* Wipe variables */
memset(context, '\0', sizeof(*context));
memset(&finalcount, '\0', sizeof(finalcount));
}
void SHA1(
char *hash_out,
const char *str,
uint32_t len)
{
SHA1_CTX ctx;
unsigned int ii;
SHA1Init(&ctx);
for (ii=0; ii<len; ii+=1)
SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
SHA1Final((unsigned char *)hash_out, &ctx);
}

View file

@ -0,0 +1,52 @@
#ifndef SHA1_H
#define SHA1_H
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
*/
#include "stdint.h"
#if defined(__cplusplus)
extern "C" {
#endif
typedef struct
{
uint32_t state[5];
uint32_t count[2];
unsigned char buffer[64];
} SHA1_CTX;
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
);
void SHA1Init(
SHA1_CTX * context
);
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
);
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
);
void SHA1(
char *hash_out,
const char *str,
uint32_t len);
#if defined(__cplusplus)
}
#endif
#endif /* SHA1_H */

View file

@ -0,0 +1,15 @@
{
"name": "sha256",
"version": "0.0.2",
"repo": "jb55/sha256.c",
"description": "sha256 in c",
"keywords": ["sha256", "sha2"],
"src": ["sha256.c", "sha256.h"],
"dependencies": {
"jb55/rotate-bits.h": "0.1.1"
},
"development": {
"thlorenz/tap.c": "*"
}
}

View file

@ -0,0 +1,221 @@
/* Crypto/Sha256.c -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "rotate-bits/rotate-bits.h"
#include "sha256.h"
/* define it for speed optimization */
#define _SHA256_UNROLL
#define _SHA256_UNROLL2
void
sha256_init(sha256_t *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define a(i) T[(0-(i))&7]
#define b(i) T[(1-(i))&7]
#define c(i) T[(2-(i))&7]
#define d(i) T[(3-(i))&7]
#define e(i) T[(4-(i))&7]
#define f(i) T[(5-(i))&7]
#define g(i) T[(6-(i))&7]
#define h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, (i+1)); \
R(g,h,a,b,c,d,e,f, (i+2)); \
R(f,g,h,a,b,c,d,e, (i+3)); \
R(e,f,g,h,a,b,c,d, (i+4)); \
R(d,e,f,g,h,a,b,c, (i+5)); \
R(c,d,e,f,g,h,a,b, (i+6)); \
R(b,c,d,e,f,g,h,a, (i+7))
#else
#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void
sha256_transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16] = {0};
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
static void
sha256_write_byte_block(sha256_t *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
sha256_transform(p->state, data32);
}
void
sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
{
sha256_t hash;
sha256_init(&hash);
sha256_update(&hash, data, size);
sha256_final(&hash, buf);
}
void
sha256_update(sha256_t *p, const unsigned char *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
sha256_write_byte_block(p);
}
}
}
void
sha256_final(sha256_t *p, unsigned char *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
sha256_write_byte_block(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
lenInBits <<= 8;
}
sha256_write_byte_block(p);
for (i = 0; i < 8; i++)
{
*digest++ = (unsigned char)(p->state[i] >> 24);
*digest++ = (unsigned char)(p->state[i] >> 16);
*digest++ = (unsigned char)(p->state[i] >> 8);
*digest++ = (unsigned char)(p->state[i]);
}
sha256_init(p);
}

View file

@ -0,0 +1,24 @@
/* Sha256.h -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain */
#ifndef __CRYPTO_SHA256_H
#define __CRYPTO_SHA256_H
#include <stdlib.h>
#include <stdint.h>
#define SHA256_DIGEST_SIZE 32
typedef struct sha256_t
{
uint32_t state[8];
uint64_t count;
unsigned char buffer[64];
} sha256_t;
void sha256_init(sha256_t *p);
void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
void sha256_final(sha256_t *p, unsigned char *digest);
void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
#endif

View file

@ -0,0 +1,12 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "mofosyne/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",
"src": [
"xxhash.c",
"xxhash.h"
]
}

View file

@ -0,0 +1,42 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"

File diff suppressed because it is too large

View file

@ -0,0 +1,693 @@
#include "ggml.h"
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
#ifdef __cplusplus
extern "C" {
#endif
#include "xxhash/xxhash.h"
#include "sha1/sha1.h"
#include "sha256/sha256.h"
#ifdef __cplusplus
}
#endif
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
#define HASH_TYPE_SHA256_STR "sha256"
#define HASH_TYPE_SHA1_STR "sha1"
#define HASH_TYPE_XXH64_STR "xxh64"
#define HASH_TYPE_UUID_STR "uuid"
typedef enum {
HASH_EXIT_SUCCESS = 0, // All hashes have been generated or validated
HASH_EXIT_FAILURE = 1, // Generic Failure
HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
} hash_exit_code_t;
typedef enum {
HASH_MANIFEST_NOT_FOUND,
HASH_MANIFEST_MISMATCH,
HASH_MANIFEST_OK,
} hash_manifest_result_t;
struct hash_params {
std::string input;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
bool no_layer = false;
bool manifest_is_usable = false;
std::string manifest_file;
};
struct manifest_check_params {
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
};
static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
switch (value) {
case HASH_MANIFEST_NOT_FOUND: return "Not Found";
case HASH_MANIFEST_MISMATCH: return "Mismatch";
case HASH_MANIFEST_OK: return "Ok";
}
return "?";
}
static char const * hash_exit_code_to_str(hash_exit_code_t value) {
switch (value) {
case HASH_EXIT_SUCCESS: return "Success";
case HASH_EXIT_FAILURE: return "Failure";
case HASH_EXIT_MISMATCH: return "Mismatch";
case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
}
return "?";
}
static void hash_print_usage(const char * executable) {
const hash_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN\n", executable);
printf("\n");
printf("Hash a GGUF file");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
printf(" --all use all hash\n");
printf(" --no-layer exclude per layer hash\n");
printf(" --uuid generate UUIDv5 ID\n");
printf(" -c, --check <manifest> verify against a manifest\n");
printf("\n");
}
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
hash_print_usage(argv[0]);
exit(0);
}
if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
arg_found = true;
params.sha1 = true;
}
if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}
if (arg == "--sha256") {
arg_found = true;
params.sha256 = true;
}
if (arg == "--all") {
arg_found = true;
params.sha256 = true;
params.sha1 = true;
params.xxh64 = true;
}
if (arg == "--no-layer") {
arg_found = true;
params.no_layer = true;
}
if (arg == "-c" || arg == "--check") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.manifest_file = argv[arg_idx];
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument:" + arg);
}
if (argc - arg_idx < 1) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
}
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
bool result = true;
try {
hash_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
hash_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
if (manifest_file.empty()) {
return false;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return false;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
// hash_type_str hash_str tensor_name
// e.g. 'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0'
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
if (line_stream >> file_hash_type) {
if (file_hash_type == HASH_TYPE_SHA256_STR) {
manifest_check.sha256 = true;
} else if (file_hash_type == HASH_TYPE_SHA1_STR) {
manifest_check.sha1 = true;
} else if (file_hash_type == HASH_TYPE_XXH64_STR) {
manifest_check.xxh64 = true;
} else if (file_hash_type == HASH_TYPE_UUID_STR) {
manifest_check.uuid = true;
}
}
}
return true;
}
static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
if (manifest_file.empty()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
std::string file_hash;
std::string file_tensor_name;
if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
// Line parsed. Check hash validity
if (file_hash_type != hash_type_str) {
continue;
}
if (file_tensor_name != tensor_name) {
continue;
}
return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
}
}
return HASH_MANIFEST_NOT_FOUND;
}
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
// Assumes that digest was processed correctly with the expected namespace
for (int i = 0; i < 16; i++) {
uuid[i] = sha1_digest[i];
}
// Set bits corresponding to UUID ver 5
uuid[ 6] &= ~(0xF << 4);
uuid[ 6] |= (5 << 4);
// Set bits corresponding to UUID variant 0b10XX
uuid[ 8] &= ~(0xc << 4);
uuid[ 8] |= (0x8 << 4);
}
static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
// sha1 init
SHA1_CTX sha1_model_hash_ctx;
if (hash_params.sha1) {
SHA1Init(&sha1_model_hash_ctx);
}
// sha256 init
sha256_t sha256_model_hash_ctx;
if (hash_params.sha256) {
sha256_init(&sha256_model_hash_ctx);
}
// sha1 for uuid init
SHA1_CTX sha1_for_uuid_ctx;
if (hash_params.uuid) {
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
SHA1Init(&sha1_for_uuid_ctx);
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
bool tensor_layer_has_mismatch = false;
bool model_has_mismatch = false;
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;
if (hash_params.xxh64) {
if (!hash_params.no_layer) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
if (!hash_params.no_layer) {
// Per Layer Hash
char result[21]; // sha1 outputs 20 bytes
SHA1( result, (const char *)raw_data, n_bytes);
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.sha256) {
if (!hash_params.no_layer) {
// Per Layer Hash
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.uuid) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
}
}
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha1) {
unsigned char result[21];
SHA1Final(result, &sha1_model_hash_ctx);
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha256) {
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_final( &sha256_model_hash_ctx, result);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
}
}
if (hash_params.uuid) {
unsigned char result[21];
SHA1Final(result, &sha1_for_uuid_ctx);
unsigned char uuid[16];
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],
uuid[12], uuid[13], uuid[14], uuid[15]);
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
}
}
ggml_free(ctx_data);
gguf_free(ctx);
if (hash_params.manifest_is_usable) {
// In hash verification mode
if (!model_in_manifest) {
// model missing in manifest?
// Check tensor layer...
if (!tensor_layer_in_manifest) {
// Still missing? Maybe we are reading the wrong manifest.
return HASH_EXIT_MANIFEST_MISSING_ENTRY;
}
if (tensor_layer_has_mismatch) {
// Per tensor check found error
return HASH_EXIT_FAILURE;
}
// All per tensor layer checks passed? Sounds good enough.
return HASH_EXIT_SUCCESS;
}
// Overall model check passed, but let's check per layer just in case
// If missing, we don't care too much, as the overall model hash already checked out
if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
return HASH_EXIT_FAILURE;
}
if (model_has_mismatch) {
// model has failed hash somewhere in the model
return HASH_EXIT_FAILURE;
}
// All checks appear to be fine
return HASH_EXIT_SUCCESS;
}
// In hash generation mode
return HASH_EXIT_SUCCESS;
}
int main(int argc, const char ** argv) {
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);
if (!params.manifest_file.empty()) {
if (!manifest_type(params.manifest_file, manifest_check)) {
printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_FILE_ERROR;
}
if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
}
printf("manifest %s", params.manifest_file.c_str());
if (manifest_check.sha256) {
printf(" sha256");
}
if (manifest_check.sha1) {
printf(" sha1");
}
if (manifest_check.xxh64) {
printf(" xxh64");
}
if (manifest_check.uuid) {
printf(" uuid");
}
printf("\n");
// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
} else if (manifest_check.sha1) {
params.sha1 = true;
} else if (manifest_check.xxh64) {
params.xxh64 = true;
} else if (manifest_check.uuid) {
params.uuid = true;
}
}
params.manifest_is_usable = true;
}
// By default, if no switch argument is provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}
hash_exit_code_t exit_code = gguf_hash(params);
if (params.manifest_is_usable) {
printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
}
return exit_code;
}

View file

@ -205,21 +205,17 @@ int main(int argc, char ** argv) {
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos); LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end; std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) { GGML_ASSERT(llama_token_prefix(model) >= 0);
inp_sfx.erase(inp_sfx.begin()); GGML_ASSERT(llama_token_suffix(model) >= 0);
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {
@ -517,19 +513,14 @@ int main(int argc, char ** argv) {
string_process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix); string_process_escapes(params.input_suffix);
} }
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix // tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {

View file

@ -3,7 +3,7 @@
#! pip install pydantic #! pip install pydantic
#! python json_schema_pydantic_example.py #! python json_schema_pydantic_example.py
from pydantic import BaseModel, Extra, TypeAdapter from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen from annotated_types import MinLen
from typing import Annotated, List, Optional from typing import Annotated, List, Optional
import json, requests import json, requests
@ -17,6 +17,9 @@ if True:
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
''' '''
response_format = None
type_adapter = None
if response_model: if response_model:
type_adapter = TypeAdapter(response_model) type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema() schema = type_adapter.json_schema()

View file

@ -1,4 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations
import argparse import argparse
import itertools import itertools
import json import json
@ -188,7 +190,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
raise RuntimeError("At least one of min_value or max_value must be set") raise RuntimeError("At least one of min_value or max_value must be set")
class BuiltinRule: class BuiltinRule:
def __init__(self, content: str, deps: list = None): def __init__(self, content: str, deps: list | None = None):
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
@ -248,7 +250,7 @@ class SchemaConverter:
def _format_literal(self, literal): def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
) )
return f'"{escaped}"' return f'"{escaped}"'
@ -403,11 +405,11 @@ class SchemaConverter:
i = 0 i = 0
length = len(pattern) length = len(pattern)
def to_rule(s: Tuple[str, bool]) -> str: def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s (txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]: def transform() -> tuple[str, bool]:
''' '''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
''' '''
@ -420,7 +422,7 @@ class SchemaConverter:
# We only need a flat structure here to apply repetition operators to the last item, and # We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!) # (GBNF's syntax is luckily very close to regular expressions!)
seq: list[Tuple[str, bool]] = [] seq: list[tuple[str, bool]] = []
def get_dot(): def get_dot():
if self._dotall: if self._dotall:

View file

@ -185,6 +185,8 @@ else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
if has_text_encoder: if has_text_encoder:
assert t_hparams is not None
assert tokens is not None
# text_model hparams # text_model hparams
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@ -259,8 +261,8 @@ if has_vision_encoder:
if processor is not None: if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
else: else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std image_std = args.image_std if args.image_std is not None else default_image_std
@ -272,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
if has_llava_projector: if has_llava_projector:
model.vision_model.encoder.layers.pop(-1) model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
projector = torch.load(args.llava_projector) projector = torch.load(args.llava_projector)
for name, data in projector.items(): for name, data in projector.items():
name = get_tensor_name(name) name = get_tensor_name(name)
@ -286,7 +288,7 @@ if has_llava_projector:
print("Projector tensors added\n") print("Projector tensors added\n")
state_dict = model.state_dict() state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
for name, data in state_dict.items(): for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
# we don't need this # we don't need this

View file

@ -2,7 +2,9 @@ import argparse
import glob import glob
import os import os
import torch import torch
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file from safetensors import safe_open
from safetensors.torch import save_file
from typing import Any, ContextManager, cast
# Function to determine if file is a SafeTensor file # Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path): def is_safetensor_file(file_path):
@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
def load_model(file_path): def load_model(file_path):
if is_safetensor_file(file_path): if is_safetensor_file(file_path):
tensors = {} tensors = {}
with safe_open(file_path, framework="pt", device="cpu") as f: with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
for key in f.keys(): for key in f.keys():
tensors[key] = f.get_tensor(key).clone() tensors[key] = f.get_tensor(key).clone()
# output shape # output shape
@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
if last_checkpoint is not None: if last_checkpoint is not None:
for k, v in last_checkpoint.items(): for k, v in last_checkpoint.items():
print(k) print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
print("No tensors found. Is this a LLaVA model?") print("No tensors found. Is this a LLaVA model?")
exit() exit()
@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint.[name].float() for name in mm_tensors} # projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {} projector = {}
for name in mm_tensors: for name in mm_tensors:
assert last_checkpoint is not None
projector[name] = last_checkpoint[name].float() projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors: for name in first_mm_tensors:
assert first_checkpoint is not None
projector[name] = first_checkpoint[name].float() projector[name] = first_checkpoint[name].float()
if len(projector) > 0: if len(projector) > 0:

View file

@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert_legacy_llama.txt -r ../../requirements/requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0 pillow~=10.2.0
torch~=2.2.1 torch~=2.2.1

View file

@ -1,6 +1,6 @@
# llama.cpp/examples/main # llama.cpp/examples/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
## Table of Contents ## Table of Contents
@ -17,60 +17,59 @@ This example program allows you to use various LLaMA language models in an easy
To get started right away, run the following command, making sure to use the correct path for the model you have: To get started right away, run the following command, making sure to use the correct path for the model you have:
#### Unix-based systems (Linux, macOS, etc.): First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face.
[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)
Once downloaded, place your model in the models folder in llama.cpp.
### Unix-based systems (Linux, macOS, etc.):
##### Input prompt (One-and-done)
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time" ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
``` ```
##### Conversation mode (Allow for continuous interaction with the model)
#### Windows:
```powershell
llama-cli.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
AI: What would you like to talk about?
User:'
``` ```
#### Windows: ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```powershell
llama-cli.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
#### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1 ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
#### Windows: ### Windows:
##### Input prompt (One-and-done)
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
```
##### Conversation mode (Allow for continuous interaction with the model)
```powershell ```powershell
llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 ./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
```
#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```powershell
llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
## Common Options ## Common Options
In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models: In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
## Input Prompts ## Input Prompts
@ -90,6 +89,7 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@ -117,6 +117,13 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
```sh ```sh
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:" ./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
``` ```
When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled
### Chat templates
`--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
Example usage: `--chat-template gemma`
## Context Management ## Context Management
@ -124,9 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th
### Context Size ### Context Size
The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations. - `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Extended Context Size ### Extended Context Size
@ -148,15 +153,15 @@ The following options allow you to control the text generation process and fine-
### Number of Tokens to Predict

- `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled)

The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.

A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.

If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
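
As a rough illustration of the context-shift behaviour described above (keep the first `--keep` tokens, then drop half of the tokens that follow), here is a small sketch; it mirrors the prose, not llama.cpp's internal bookkeeping.

```python
def shift_context(tokens: list[int], n_ctx: int, n_keep: int) -> list[int]:
    # When the window is full, keep the first n_keep tokens and discard half of
    # the remaining (oldest) tokens, as described above.
    if len(tokens) < n_ctx:
        return tokens
    n_left = len(tokens) - n_keep
    n_discard = n_left // 2
    return tokens[:n_keep] + tokens[n_keep + n_discard:]

# e.g. a full 8-token window with --keep 2 drops tokens 2..4
print(shift_context(list(range(8)), n_ctx=8, n_keep=2))  # [0, 1, 5, 6, 7]
```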
### Temperature
@ -164,15 +169,15 @@ It is important to note that the generated text may be shorter than the specifie
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.

Example usage: `--temp 0`
### Repeat Penalty

- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.0, 1.0 = disabled).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.0 (disabled).

The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
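
As a sketch of how such a penalty is commonly applied, the snippet below scales the logits of recently seen tokens so they become less likely; the exact sampler code in llama.cpp may differ in details.

```python
def apply_repeat_penalty(logits: dict[int, float], last_tokens: list[int], penalty: float) -> dict[int, float]:
    # A positive logit is divided by the penalty and a negative one is
    # multiplied by it, so penalized tokens always become less likely.
    out = dict(logits)
    for tok in set(last_tokens):
        if tok in out:
            out[tok] = out[tok] / penalty if out[tok] > 0 else out[tok] * penalty
    return out

print(apply_repeat_penalty({1: 2.0, 2: -1.0, 3: 0.5}, last_tokens=[1, 2], penalty=1.1))
```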
@ -196,19 +201,19 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
Example usage: `--top-p 0.95`

### Min-P Sampling

- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1).

The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.

Example usage: `--min-p 0.05`
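
The rule above amounts to a one-line filter over the candidate probabilities; a small illustrative sketch (not the actual sampler):

```python
def min_p_filter(probs: dict[int, float], p: float) -> dict[int, float]:
    # Keep only tokens whose probability is at least p times the top probability.
    threshold = p * max(probs.values())
    return {tok: pr for tok, pr in probs.items() if pr >= threshold}

# With p=0.05 and a top probability of 0.9, anything below 0.045 is dropped.
print(min_p_filter({1: 0.9, 2: 0.05, 3: 0.04}, p=0.05))  # {1: 0.9, 2: 0.05}
```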
### Tail-Free Sampling (TFS)

- `--tfs N`: Enable tail-free sampling with parameter z (default: 1.0, 1.0 = disabled).

Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.

Example usage: `--tfs 0.95`
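
A rough numerical sketch of the idea (sorted probabilities, absolute second differences, cumulative weight cut at z); the real sampler differs in details such as how the weights are normalized.

```python
def tail_free_keep(probs: list[float], z: float) -> int:
    # probs: candidate probabilities sorted in descending order.
    # Returns how many of the most likely tokens to keep.
    if len(probs) < 3 or z >= 1.0:
        return len(probs)                       # z = 1.0 disables TFS
    d1 = [probs[i] - probs[i + 1] for i in range(len(probs) - 1)]
    d2 = [abs(d1[i] - d1[i + 1]) for i in range(len(d1) - 1)]
    total = sum(d2) or 1.0
    cum = 0.0
    for i, w in enumerate(d2):
        cum += w / total
        if cum > z:
            return i + 1                        # cut the tail once the budget z is spent
    return len(probs)

probs = [0.50, 0.25, 0.12, 0.06, 0.04, 0.02, 0.01]
print(tail_free_keep(probs, z=0.95))            # keeps the 3 most likely tokens here
```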
@ -307,10 +312,8 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
@ -6,10 +6,10 @@ import re
from copy import copy from copy import copy
from enum import Enum from enum import Enum
from inspect import getdoc, isclass from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, Field, create_model from pydantic import BaseModel, create_model
if TYPE_CHECKING: if TYPE_CHECKING:
from types import GenericAlias from types import GenericAlias
@ -17,6 +17,9 @@ else:
# python 3.8 compat # python 3.8 compat
from typing import _GenericAlias as GenericAlias from typing import _GenericAlias as GenericAlias
# TODO: fix this
# pyright: reportAttributeAccessIssue=information
class PydanticDataType(Enum): class PydanticDataType(Enum):
""" """
@ -234,8 +237,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
# Define the integer part rule # Define the integer part rule
integer_part_rule = ( integer_part_rule = (
"integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( "integer-part"
f"-min{min_digit}" if min_digit is not None else "") + (f"-max{max_digit}" if max_digit is not None else "")
+ (f"-min{min_digit}" if min_digit is not None else "")
) )
# Define the fractional part rule based on precision constraints # Define the fractional part rule based on precision constraints
@ -458,7 +462,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
if not issubclass(model, BaseModel): if not issubclass(model, BaseModel):
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__ # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
if hasattr(model, "__annotations__") and model.__annotations__: if hasattr(model, "__annotations__") and model.__annotations__:
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} # pyright: ignore[reportGeneralTypeIssues]
else: else:
init_signature = inspect.signature(model.__init__) init_signature = inspect.signature(model.__init__)
parameters = init_signature.parameters parameters = init_signature.parameters
@ -680,7 +684,7 @@ def generate_markdown_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
@ -700,7 +704,7 @@ def generate_markdown_documentation(
# Indenting the fields section # Indenting the fields section
documentation += f" {fields_prefix}:\n" documentation += f" {fields_prefix}:\n"
else: else:
documentation += f" Fields:\n" documentation += f" Fields:\n" # noqa: F541
if isclass(model) and issubclass(model, BaseModel): if isclass(model) and issubclass(model, BaseModel):
for name, field_type in model.__annotations__.items(): for name, field_type in model.__annotations__.items():
# if name == "markdown_code_block": # if name == "markdown_code_block":
@ -778,7 +782,7 @@ def generate_field_markdown(
return field_text return field_text
if field_description != "": if field_description != "":
field_text += f" Description: " + field_description + "\n" field_text += f" Description: {field_description}\n"
# Check for and include field-specific examples if available # Check for and include field-specific examples if available
if hasattr(model, "Config") and hasattr(model.Config, if hasattr(model, "Config") and hasattr(model.Config,
@ -833,7 +837,7 @@ def generate_text_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
@ -1164,7 +1168,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
for name, param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description dynamic_model.model_fields[name].description = param_doc.description
@ -1228,9 +1232,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
return output return output
from enum import Enum
def json_schema_to_python_types(schema): def json_schema_to_python_types(schema):
type_map = { type_map = {
"any": Any, "any": Any,
@ -1275,7 +1276,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] fields[field_name] = (List[array_type], ...)
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":
@ -1285,7 +1286,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = field_data.get("enum", []) required = field_data.get("enum", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
else: else:
field_type = json_schema_to_python_types(field_type) field_type = json_schema_to_python_types(field_type)
fields[field_name] = (field_type, ...) fields[field_name] = (field_type, ...)
@ -1305,6 +1307,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = dictionary.get("required", []) required = dictionary.get("required", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
custom_model = create_model(model_name, **fields) custom_model = create_model(model_name, **fields)
return custom_model return custom_model
@ -1,6 +1,7 @@
# Function calling example using pydantic models. # Function calling example using pydantic models.
from __future__ import annotations
import datetime import datetime
import importlib
import json import json
from enum import Enum from enum import Enum
from typing import Optional, Union from typing import Optional, Union
@ -215,9 +216,9 @@ for call in json_data:
if call["function"] == "Calculator": if call["function"] == "Calculator":
print(Calculator(**call["params"]).run()) print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime": elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather": elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
# Should output something like this: # Should output something like this:
# 2024-01-14 13:36:06 # 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"} # {"location": "London", "temperature": "42", "unit": "celsius"}
@ -47,6 +47,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@ -366,7 +366,8 @@ Notice that each `probs` is an array of length `n_probs`.
"assistant_name": "", "assistant_name": "",
"user_name": "", "user_name": "",
"default_generation_settings": { ... }, "default_generation_settings": { ... },
"total_slots": 1 "total_slots": 1,
"chat_template": ""
} }
``` ```
@ -374,6 +375,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
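
Since the response now also reports `chat_template`, a client can read the model's template directly. A minimal sketch, assuming the default address `localhost:8080` and the `/props` path served by the `handle_props` handler shown later in this diff:

```python
import requests

# Query the server properties; `chat_template` is empty if the model ships no
# Jinja template in its metadata.
props = requests.get("http://localhost:8080/props").json()
print("slots:", props["total_slots"])
print("chat template:", props["chat_template"] or "<none>")
```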
@ -1,3 +1,5 @@
from __future__ import annotations
import argparse import argparse
import json import json
import os import os
@ -59,10 +61,11 @@ def main(args_in: list[str] | None = None) -> None:
sys.exit(1) sys.exit(1)
# start the benchmark # start the benchmark
iterations = 0
data = {}
try: try:
start_benchmark(args) start_benchmark(args)
iterations = 0
with open("results.github.env", 'w') as github_env: with open("results.github.env", 'w') as github_env:
# parse output # parse output
with open('k6-results.json', 'r') as bench_results: with open('k6-results.json', 'r') as bench_results:
@ -129,7 +132,7 @@ def main(args_in: list[str] | None = None) -> None:
timestamps, metric_values = zip(*values) timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values] metric_values = [float(value) for value in metric_values]
prometheus_metrics[metric] = metric_values prometheus_metrics[metric] = metric_values
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80) plt.figure(figsize=(16, 10), dpi=80)
plt.plot(timestamps_dt, metric_values, label=metric) plt.plot(timestamps_dt, metric_values, label=metric)
plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
@ -156,7 +159,7 @@ def main(args_in: list[str] | None = None) -> None:
plt.close() plt.close()
# Mermaid format in case images upload failed # Mermaid format in case images upload failed
with (open(f"{metric}.mermaid", 'w') as mermaid_f): with open(f"{metric}.mermaid", 'w') as mermaid_f:
mermaid = ( mermaid = (
f"""--- f"""---
config: config:
@ -278,7 +281,7 @@ def start_server_background(args):
} }
server_process = subprocess.Popen( server_process = subprocess.Popen(
args, args,
**pkwargs) **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):
@ -738,6 +738,8 @@ struct server_context {
slot.ga_n = ga_n; slot.ga_n = ga_n;
slot.ga_w = ga_w; slot.ga_w = ga_w;
slot.sparams = params.sparams;
slot.reset(); slot.reset();
slots.push_back(slot); slots.push_back(slot);
@ -885,7 +887,8 @@ struct server_context {
bool launch_slot_with_task(server_slot & slot, const server_task & task) { bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot_params default_params; slot_params default_params;
llama_sampling_params default_sparams; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
llama_sampling_params default_sparams = params.sparams;
auto & data = task.data; auto & data = task.data;
if (data.count("__oaicompat") != 0) { if (data.count("__oaicompat") != 0) {
@ -2606,7 +2609,7 @@ int main(int argc, char ** argv) {
// if a custom chat template is not supplied, we will use the one that comes with the model (if any) // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) { if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) { if (!ctx_server.validate_model_chat_template()) {
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
params.chat_template = "chatml"; params.chat_template = "chatml";
} }
} }
@ -2968,11 +2971,20 @@ int main(int argc, char ** argv) {
}; };
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
std::string template_key = "tokenizer.chat_template", curr_tmpl;
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
if (tlen > 0) {
std::vector<char> curr_tmpl_buf(tlen + 1, 0);
if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
}
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
{ "system_prompt", ctx_server.system_prompt.c_str() }, { "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel } { "total_slots", ctx_server.params.n_parallel },
{ "chat_template", curr_tmpl.c_str() }
}; };
res.set_content(data.dump(), "application/json; charset=utf-8"); res.set_content(data.dump(), "application/json; charset=utf-8");
@ -1,5 +1,4 @@
import asyncio import asyncio
import collections
import json import json
import os import os
import re import re
@ -8,19 +7,23 @@ import subprocess
import sys import sys
import threading import threading
import time import time
from collections.abc import Sequence
from contextlib import closing from contextlib import closing
from re import RegexFlag from re import RegexFlag
from typing import Any, Literal, cast
import aiohttp import aiohttp
import numpy as np import numpy as np
import openai import openai
from behave import step from openai.types.chat import ChatCompletionChunk
from behave import step # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete from behave.api.async_step import async_run_until_complete
from prometheus_client import parser from prometheus_client import parser
# pyright: reportRedeclaration=false
@step("a server listening on {server_fqdn}:{server_port}") @step("a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn, server_port): def step_server_config(context, server_fqdn: str, server_port: str):
context.server_fqdn = server_fqdn context.server_fqdn = server_fqdn
context.server_port = int(server_port) context.server_port = int(server_port)
context.n_threads = None context.n_threads = None
@ -74,34 +77,34 @@ def step_server_config(context, server_fqdn, server_port):
@step('a model file {hf_file} from HF repo {hf_repo}') @step('a model file {hf_file} from HF repo {hf_repo}')
def step_download_hf_model(context, hf_file, hf_repo): def step_download_hf_model(context, hf_file: str, hf_repo: str):
context.model_hf_repo = hf_repo context.model_hf_repo = hf_repo
context.model_hf_file = hf_file context.model_hf_file = hf_file
context.model_file = os.path.basename(hf_file) context.model_file = os.path.basename(hf_file)
@step('a model file {model_file}') @step('a model file {model_file}')
def step_model_file(context, model_file): def step_model_file(context, model_file: str):
context.model_file = model_file context.model_file = model_file
@step('a model url {model_url}') @step('a model url {model_url}')
def step_model_url(context, model_url): def step_model_url(context, model_url: str):
context.model_url = model_url context.model_url = model_url
@step('a model alias {model_alias}') @step('a model alias {model_alias}')
def step_model_alias(context, model_alias): def step_model_alias(context, model_alias: str):
context.model_alias = model_alias context.model_alias = model_alias
@step('{seed:d} as server seed') @step('{seed:d} as server seed')
def step_seed(context, seed): def step_seed(context, seed: int):
context.server_seed = seed context.server_seed = seed
@step('{ngl:d} GPU offloaded layers') @step('{ngl:d} GPU offloaded layers')
def step_n_gpu_layer(context, ngl): def step_n_gpu_layer(context, ngl: int):
if 'N_GPU_LAYERS' in os.environ: if 'N_GPU_LAYERS' in os.environ:
new_ngl = int(os.environ['N_GPU_LAYERS']) new_ngl = int(os.environ['N_GPU_LAYERS'])
if context.debug: if context.debug:
@ -111,37 +114,37 @@ def step_n_gpu_layer(context, ngl):
@step('{n_threads:d} threads') @step('{n_threads:d} threads')
def step_n_threads(context, n_threads): def step_n_threads(context, n_threads: int):
context.n_thread = n_threads context.n_thread = n_threads
@step('{draft:d} as draft') @step('{draft:d} as draft')
def step_draft(context, draft): def step_draft(context, draft: int):
context.draft = draft context.draft = draft
@step('{n_ctx:d} KV cache size') @step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx): def step_n_ctx(context, n_ctx: int):
context.n_ctx = n_ctx context.n_ctx = n_ctx
@step('{n_slots:d} slots') @step('{n_slots:d} slots')
def step_n_slots(context, n_slots): def step_n_slots(context, n_slots: int):
context.n_slots = n_slots context.n_slots = n_slots
@step('{n_predict:d} server max tokens to predict') @step('{n_predict:d} server max tokens to predict')
def step_server_n_predict(context, n_predict): def step_server_n_predict(context, n_predict: int):
context.n_server_predict = n_predict context.n_server_predict = n_predict
@step('{slot_save_path} as slot save path') @step('{slot_save_path} as slot save path')
def step_slot_save_path(context, slot_save_path): def step_slot_save_path(context, slot_save_path: str):
context.slot_save_path = slot_save_path context.slot_save_path = slot_save_path
@step('using slot id {id_slot:d}') @step('using slot id {id_slot:d}')
def step_id_slot(context, id_slot): def step_id_slot(context, id_slot: int):
context.id_slot = id_slot context.id_slot = id_slot
@ -191,7 +194,7 @@ def step_start_server(context):
@step("the server is {expecting_status}") @step("the server is {expecting_status}")
@async_run_until_complete @async_run_until_complete
async def step_wait_for_the_server_to_be_started(context, expecting_status): async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
match expecting_status: match expecting_status:
case 'healthy': case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok', await wait_for_health_status(context, context.base_url, 200, 'ok',
@ -221,7 +224,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
@step('all slots are {expected_slot_status_string}') @step('all slots are {expected_slot_status_string}')
@async_run_until_complete @async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string): async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
match expected_slot_status_string: match expected_slot_status_string:
case 'idle': case 'idle':
expected_slot_status = 0 expected_slot_status = 0
@ -237,7 +240,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
@step('a completion request with {api_error} api error') @step('a completion request with {api_error} api error')
@async_run_until_complete @async_run_until_complete
async def step_request_completion(context, api_error): async def step_request_completion(context, api_error: Literal['raised'] | str):
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1) seeds = await completions_seed(context, num_seeds=1)
completion = await request_completion(context.prompts.pop(), completion = await request_completion(context.prompts.pop(),
@ -777,8 +780,8 @@ def step_assert_metric_value(context, metric_name, metric_value):
def step_available_models(context): def step_available_models(context):
# openai client always expects an api_key # openai client always expects an api_key
openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
openai.api_base = f'{context.base_url}/v1' openai.base_url = f'{context.base_url}/v1/'
context.models = openai.Model.list().data context.models = openai.models.list().data
@step('{n_model:d} models are supported') @step('{n_model:d} models are supported')
@ -789,7 +792,7 @@ def step_supported_models(context, n_model):
@step('model {i_model:d} is {param} {preposition} {param_value}') @step('model {i_model:d} is {param} {preposition} {param_value}')
def step_supported_models(context, i_model, param, preposition, param_value): def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str):
assert i_model < len(context.models) assert i_model < len(context.models)
model = context.models[i_model] model = context.models[i_model]
@ -798,7 +801,7 @@ def step_supported_models(context, i_model, param, preposition, param_value):
case 'identified': case 'identified':
value = model.id value = model.id
case 'trained': case 'trained':
value = str(model.meta.n_ctx_train) value = str(model.meta["n_ctx_train"])
case _: case _:
assert False, "param {param} not supported" assert False, "param {param} not supported"
assert param_value == value, f"model param {param} {value} != {param_value}" assert param_value == value, f"model param {param} {value} != {param_value}"
@ -810,6 +813,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
print(f"starting {context.n_prompts} concurrent completion requests...") print(f"starting {context.n_prompts} concurrent completion requests...")
assert context.n_prompts > 0 assert context.n_prompts > 0
seeds = await completions_seed(context) seeds = await completions_seed(context)
assert seeds is not None
for prompt_no in range(context.n_prompts): for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
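
In isolation, the fan-out pattern used by these concurrent steps looks like the sketch below; the names here are illustrative and are not the test suite's own.

```python
import asyncio

async def fake_completion(prompt: str, seed: int) -> str:
    await asyncio.sleep(0.01)                 # stand-in for an HTTP request
    return f"{prompt!r} (seed={seed})"

async def main() -> None:
    prompts = ["a", "b", "c"]
    seeds = [1, 2, 3]
    # Schedule all requests up front, then await them together.
    tasks = [asyncio.create_task(fake_completion(p, s)) for p, s in zip(prompts, seeds)]
    for result in await asyncio.gather(*tasks):
        print(result)

asyncio.run(main())
```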
@ -861,7 +865,7 @@ async def request_completion(prompt,
id_slot=None, id_slot=None,
expect_api_error=None, expect_api_error=None,
user_api_key=None, user_api_key=None,
temperature=None): temperature=None) -> int | dict[str, Any]:
if debug: if debug:
print(f"Sending completion request: {prompt}") print(f"Sending completion request: {prompt}")
origin = "my.super.domain" origin = "my.super.domain"
@ -899,8 +903,8 @@ async def request_completion(prompt,
async def oai_chat_completions(user_prompt, async def oai_chat_completions(user_prompt,
seed, seed,
system_prompt, system_prompt,
base_url, base_url: str,
base_path, base_path: str,
async_client, async_client,
debug=False, debug=False,
temperature=None, temperature=None,
@ -909,7 +913,7 @@ async def oai_chat_completions(user_prompt,
enable_streaming=None, enable_streaming=None,
response_format=None, response_format=None,
user_api_key=None, user_api_key=None,
expect_api_error=None): expect_api_error=None) -> int | dict[str, Any]:
if debug: if debug:
print(f"Sending OAI Chat completions request: {user_prompt}") print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key # openai client always expects an api key
@ -989,32 +993,35 @@ async def oai_chat_completions(user_prompt,
else: else:
try: try:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}{base_path}' openai.base_url = f'{base_url}{base_path.removesuffix("chat")}'
chat_completion = openai.Completion.create( assert model is not None
chat_completion = openai.chat.completions.create(
messages=payload['messages'], messages=payload['messages'],
model=model, model=model,
max_tokens=n_predict, max_tokens=n_predict,
stream=enable_streaming, stream=enable_streaming,
response_format=payload.get('response_format'), response_format=payload.get('response_format') or openai.NOT_GIVEN,
seed=seed, seed=seed,
temperature=payload['temperature'] temperature=payload['temperature']
) )
except openai.error.AuthenticationError as e: except openai.AuthenticationError as e:
if expect_api_error is not None and expect_api_error: if expect_api_error is not None and expect_api_error:
return 401 return 401
else: else:
assert False, f'error raised: {e}' assert False, f'error raised: {e}'
if enable_streaming: if enable_streaming:
chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion)
for chunk in chat_completion: for chunk in chat_completion:
assert len(chunk.choices) == 1 assert len(chunk.choices) == 1
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
if 'content' in delta: if delta.content is not None:
completion_response['content'] += delta['content'] completion_response['content'] += delta.content
completion_response['timings']['predicted_n'] += 1 completion_response['timings']['predicted_n'] += 1
completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
else: else:
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
assert chat_completion.usage is not None
completion_response = { completion_response = {
'content': chat_completion.choices[0].message.content, 'content': chat_completion.choices[0].message.content,
'timings': { 'timings': {
@ -1028,7 +1035,7 @@ async def oai_chat_completions(user_prompt,
return completion_response return completion_response
async def request_embedding(content, seed, base_url=None): async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding', async with session.post(f'{base_url}/embedding',
json={ json={
@ -1041,7 +1048,7 @@ async def request_embedding(content, seed, base_url=None):
async def request_oai_embeddings(input, seed, async def request_oai_embeddings(input, seed,
base_url=None, user_api_key=None, base_url=None, user_api_key=None,
model=None, async_client=False): model=None, async_client=False) -> list[list[float]]:
# openai client always expects an api_key # openai client always expects an api_key
user_api_key = user_api_key if user_api_key is not None else 'nope' user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client: if async_client:
@ -1063,7 +1070,7 @@ async def request_oai_embeddings(input, seed,
response_json = await response.json() response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}" assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list' assert response_json['object'] == 'list'
if isinstance(input, collections.abc.Sequence): if isinstance(input, Sequence):
embeddings = [] embeddings = []
for an_oai_embeddings in response_json['data']: for an_oai_embeddings in response_json['data']:
embeddings.append(an_oai_embeddings['embedding']) embeddings.append(an_oai_embeddings['embedding'])
@ -1072,19 +1079,14 @@ async def request_oai_embeddings(input, seed,
return embeddings return embeddings
else: else:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1' openai.base_url = f'{base_url}/v1/'
oai_embeddings = openai.Embedding.create( assert model is not None
oai_embeddings = openai.embeddings.create(
model=model, model=model,
input=input, input=input,
) )
if isinstance(input, collections.abc.Sequence): return [e.embedding for e in oai_embeddings.data]
embeddings = []
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
embeddings = [oai_embeddings.data.embedding]
return embeddings
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
@ -1343,7 +1345,7 @@ def start_server_background(context):
} }
context.server_process = subprocess.Popen( context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]], [str(arg) for arg in [context.server_path, *server_args]],
**pkwargs) **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):
@ -1,6 +1,6 @@
aiohttp~=3.9.3
behave~=1.2.6
huggingface_hub~=0.20.3
numpy~=1.26.4
openai~=1.30.3
prometheus-client~=0.20.0
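
The bump from `openai~=0.25.0` to `openai~=1.30.3` is what drives the `openai.base_url` and `openai.chat.completions.create` changes in the steps above. A minimal sketch of the 1.x client style against a local llama.cpp server (the URL and model name are placeholders):

```python
from openai import OpenAI

# The 1.x client is instantiated explicitly instead of configuring module
# globals such as openai.api_base; the llama.cpp server accepts any api_key.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

resp = client.chat.completions.create(
    model="gpt-3.5-turbo",  # ignored by llama.cpp, but required by the client
    messages=[{"role": "user", "content": "Say hello in one word."}],
    max_tokens=8,
)
print(resp.choices[0].message.content)
```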
@ -1,13 +1,15 @@
import asyncio import asyncio
import asyncio.threads
import requests import requests
import numpy as np import numpy as np
n = 8 n = 8
result = [] result = []
async def requests_post_async(*args, **kwargs): async def requests_post_async(*args, **kwargs):
return await asyncio.to_thread(requests.post, *args, **kwargs) return await asyncio.threads.to_thread(requests.post, *args, **kwargs)
async def main(): async def main():
model_url = "http://127.0.0.1:6900" model_url = "http://127.0.0.1:6900"
@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n"); fprintf(stream, " --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n"); fprintf(stream, " --show-count print the total number of tokens.\n");
} }
@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
// variables where to put any arguments we see. // variables where to put any arguments we see.
bool printing_ids = false; bool printing_ids = false;
bool no_bos = false; bool no_bos = false;
bool no_parse_special = false;
bool disable_logging = false; bool disable_logging = false;
bool show_token_count = false; bool show_token_count = false;
const char * model_path = NULL; const char * model_path = NULL;
@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--no-bos") { else if (arg == "--no-bos") {
no_bos = true; no_bos = true;
} }
else if (arg == "--no-parse-special") {
no_parse_special = true;
}
else if (arg == "-p" || arg == "--prompt") { else if (arg == "-p" || arg == "--prompt") {
if (prompt_set) { if (prompt_set) {
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
const bool model_wants_add_bos = llama_should_add_bos_token(model); const bool model_wants_add_bos = llama_should_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos; const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, true); tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
if (printing_ids) { if (printing_ids) {
printf("["); printf("[");
@ -66,7 +66,7 @@ class Tensor:
if len(self.ne) == 0: if len(self.ne) == 0:
self.nbytes = 0 self.nbytes = 0
else: else:
self.nbytes = int(np.product(self.ne)) * 4 self.nbytes = int(np.prod(self.ne)) * 4
else: else:
raise ValueError(f"Unhandled data type '{self.dtype}'") raise ValueError(f"Unhandled data type '{self.dtype}'")
@ -99,6 +99,8 @@ async def main():
tasks = [] tasks = []
base_dict = {"FLOAT_TYPE": "float"}
for fp16 in (False, True): for fp16 in (False, True):
# MUL_MAT # MUL_MAT
matmul_shaders(tasks, fp16, False) matmul_shaders(tasks, fp16, False)
@ -106,8 +108,6 @@ async def main():
matmul_shaders(tasks, fp16, True) matmul_shaders(tasks, fp16, True)
for tname in type_names: for tname in type_names:
base_dict = {"FLOAT_TYPE": "float"}
# mul mat vec # mul mat vec
data_a_key = f"DATA_A_{tname.upper()}" data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp" shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
@ -390,6 +390,9 @@ extern "C" {
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30, GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
@ -431,6 +434,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:
@ -2413,6 +2419,12 @@ extern "C" {
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc); const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
int64_t k, int64_t bx);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef struct { typedef struct {
const char * type_name; const char * type_name;
@ -2425,6 +2437,11 @@ extern "C" {
ggml_vec_dot_t vec_dot; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously; int64_t nrows; // number of rows to process simultaneously;
int64_t ncols; // number of columns to process simultaneously;
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t; } ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
ggml/src/ggml-aarch64.c (new file, 2187 lines; diff suppressed because it is too large)
ggml/src/ggml-aarch64.h (new file, 39 lines)
@ -0,0 +1,39 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif
@ -199,6 +199,30 @@ typedef struct {
} block_q8_1; } block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
//
// Super-block quantization structures
//
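
The four `static_assert`s above pin down the interleaved block layouts. As a quick sanity check, the same sizes can be recomputed by hand, assuming the usual ggml constants `QK4_0 = QK8_0 = 32` and a 2-byte `ggml_half`:

```python
# Sanity-check of the interleaved block sizes, assuming QK4_0 = QK8_0 = 32 and
# sizeof(ggml_half) = 2 (the standard ggml values).
QK4_0 = QK8_0 = 32
HALF = 2

sizes = {
    "block_q4_0x4": 4 * HALF + QK4_0 * 2,   # 4 deltas + packed nibbles for 4 blocks
    "block_q4_0x8": 8 * HALF + QK4_0 * 4,   # 8 deltas + packed nibbles for 8 blocks
    "block_q8_0x4": 4 * HALF + QK8_0 * 4,   # 4 deltas + int8 quants for 4 blocks
    "block_q8_0x8": 8 * HALF + QK8_0 * 8,   # 8 deltas + int8 quants for 8 blocks
}
for name, nbytes in sizes.items():
    print(f"{name}: {nbytes} bytes")        # 72, 144, 136, 272
```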
@ -31,6 +31,7 @@ bool g_mul_mat_q = false;
#include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh" #include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh" #include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
@ -2265,6 +2266,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst); ggml_cuda_op_im2col(ctx, dst);
break; break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_cuda_op_conv_transpose_1d(ctx,dst);
break;
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst); ggml_cuda_op_pool2d(ctx, dst);
break; break;
@ -2808,6 +2812,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
ggml_type src0_type = op->src[0]->type; ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break; } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1]->type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
return false;
} break;
case GGML_OP_NONE: case GGML_OP_NONE:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:

View file

@ -0,0 +1,87 @@
#include "conv-transpose-1d.cuh"
static __global__ void conv_transpose_1d_kernel(
const int s0, const int p0, const int d0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst) {
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
if (global_index >= output_size) {
return;
}
int out_index = global_index / dst_ne0;
float accumulator = 0;
for (int c = 0; c < src0_ne2; c++) {
int idx = global_index % dst_ne0;
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
int input_offset = src1_ne0 * c;
for (int i = 0; i < src1_ne0; i++) {
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
continue;
}
int weight_idx = idx - i*s0;
float kernel_weight = src0[kernel_offset + weight_idx];
float input_value = src1[input_offset+i];
accumulator += kernel_weight * input_value;
}
}
dst[global_index] = accumulator;
}
static void conv_transpose_1d_f32_f32_cuda(
const int s0, const int p0, const int d0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst,
cudaStream_t stream) {
const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
s0,p0,d0,output_size,
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
src0,src1, dst);
}
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
const ggml_tensor * src1 = dst->src[1];
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
const int32_t * opts = (const int32_t *)dst->op_params;
const int s0 = opts[0];
const int p0 = 0;//opts[3];
const int d0 = 1;//opts[4];
const int64_t kernel_size = ggml_nelements(src0);
const int64_t input_size = ggml_nelements(src1);
const int64_t output_size = ggml_nelements(dst);
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
src0_d, src1_d, dst_d, stream);
}
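As a sanity check on the index math in the kernel above, here is a hedged CPU reference in plain C++ that walks the same global_index -> (channel, position) mapping with p0 = 0 and d0 = 1; the sizes and values are made up for illustration.

    #include <cstdio>
    #include <vector>

    int main() {
        const int s0 = 2, K = 3;                       // stride, kernel width (src0_ne0)
        const int Cout = 1, Cin = 1, L = 4;            // src0_ne1, src0_ne2, src1_ne0
        const int Lout = (L - 1) * s0 + K;             // dst_ne0 with p0 = 0, d0 = 1

        std::vector<float> kernel = {1.f, 2.f, 3.f};      // laid out as [Cin][Cout][K]
        std::vector<float> input  = {1.f, 1.f, 1.f, 1.f}; // laid out as [Cin][L]
        std::vector<float> out(Cout * Lout, 0.f);

        for (int g = 0; g < Cout * Lout; ++g) {        // one "thread" per output element
            const int oc  = g / Lout;                  // out_index in the kernel above
            const int idx = g % Lout;                  // position within the output row
            float acc = 0.f;
            for (int c = 0; c < Cin; ++c)
                for (int i = 0; i < L; ++i)
                    if (idx >= i * s0 && idx < i * s0 + K)
                        acc += kernel[(K * Cout) * c + oc * K + (idx - i * s0)] * input[L * c + i];
            out[g] = acc;
        }
        for (float v : out) std::printf("%g ", v);     // prints: 1 2 4 2 4 2 4 2 3
        std::printf("\n");
        return 0;
    }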

View file

@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
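For reference, the launch shape used by conv_transpose_1d_f32_f32_cuda is one thread per dst element, rounded up to whole blocks of the size defined here; a quick sketch of that ceiling division with a made-up element count:

    #include <cstdio>

    int main() {
        const int block_size  = 256;                  // CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE
        const int output_size = 1000;                 // hypothetical ggml_nelements(dst)
        const int num_blocks  = (output_size + block_size - 1) / block_size;
        std::printf("%d blocks x %d threads = %d threads for %d outputs\n",
                    num_blocks, block_size, num_blocks * block_size, output_size);
        // the kernel's early return (global_index >= output_size) masks the extra 24 threads
        return 0;
    }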

View file

@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB) // precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init() // defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16]; extern float ggml_table_f32_f16[1 << 16];

View file

@ -3815,6 +3815,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
#endif #endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
@ -3851,7 +3852,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
#elif defined(__ARM_NEON) return;
}
#endif
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -5423,6 +5427,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
#endif #endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f);
@ -5447,7 +5452,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
#elif defined(__ARM_NEON) return;
}
#endif
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f);
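The SVE hunks above wrap the existing 256-bit kernels in a run-time vector-length check instead of asserting: svcntb() reports the SVE register width in bytes, and only when it equals QK8_0 (32 bytes, i.e. 256-bit SVE) is the SVE path taken; otherwise control falls through to the NEON code. A minimal sketch of that dispatch pattern, assuming an AArch64 toolchain with SVE enabled:

    #include <cstdio>
    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>
    #endif

    // true only when the hardware vector length matches what the hand-written
    // SVE kernels expect (QK8_0 == 32 bytes, i.e. 256-bit SVE registers)
    static bool use_sve_q8_0_path() {
    #if defined(__ARM_FEATURE_SVE)
        return svcntb() == 32;   // svcntb(): SVE vector length in bytes at run time
    #else
        return false;            // built without SVE: always take the NEON/scalar path
    #endif
    }

    int main() {
        std::printf("SVE q8_0 path: %s\n", use_sve_q8_0_path() ? "yes" : "no (fall through)");
        return 0;
    }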
@ -14761,6 +14769,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
} \ } \
} }
#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
for (size_t j = 0; j < (nr); ++j) { \
if (!validate_fp16(q[i].d[j], i)) { \
return false; \
} \
} \
}
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) { bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
if (type < 0 || type >= GGML_TYPE_COUNT) { if (type < 0 || type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid type %d\n", __func__, type); fprintf(stderr, "%s: invalid type %d\n", __func__, type);
@ -14978,6 +14996,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{ {
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break; } break;
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
} break;
case GGML_TYPE_Q4_0_8_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:

View file

@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE); use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // SYCL_USE_XMX #endif // SYCL_USE_XMX
// mmvq path is faster in the CUDA backend.
if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch // KQ single-batch
ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);

View file

@ -346,4 +346,10 @@ inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr); return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
} }
// Helper for accessing pointers with no warnings
template <typename Tp, int dim>
static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}
#endif // GGML_SYCL_COMMON_HPP #endif // GGML_SYCL_COMMON_HPP

View file

@ -158,7 +158,7 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
sycl::range<3>(1, 1, 32)), sycl::range<3>(1, 1, 32)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1); dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
}); });
}); });
} }

View file

@ -1835,10 +1835,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_0<need_check>( mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_0_acc_ct1),
tile_x_d_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q4_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -1870,10 +1870,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_0<need_check>( mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_0_acc_ct1),
tile_x_d_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q4_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -1950,10 +1950,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_1<need_check>( mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_1_acc_ct1),
tile_x_dm_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -1985,10 +1985,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_1<need_check>( mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_1_acc_ct1),
tile_x_dm_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2065,10 +2065,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_0<need_check>( mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_0_acc_ct1),
tile_x_d_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q5_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2100,10 +2100,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_0<need_check>( mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_0_acc_ct1),
tile_x_d_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q5_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2180,10 +2180,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_1<need_check>( mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_1_acc_ct1),
tile_x_dm_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2215,10 +2215,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_1<need_check>( mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_1_acc_ct1),
tile_x_dm_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2295,10 +2295,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q8_0<need_check>( mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q8_0_acc_ct1),
tile_x_d_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q8_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2330,10 +2330,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q8_0<need_check>( mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q8_0_acc_ct1),
tile_x_d_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q8_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2412,11 +2412,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q2_K<need_check>( mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q2_K_acc_ct1),
tile_x_dm_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q2_K_acc_ct1),
tile_x_sc_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q2_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2450,11 +2450,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q2_K<need_check>( mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q2_K_acc_ct1),
tile_x_dm_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q2_K_acc_ct1),
tile_x_sc_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q2_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2537,12 +2537,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q3_K<need_check>( mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q3_K_acc_ct1),
tile_x_dm_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q3_K_acc_ct1),
tile_x_qh_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_qh_q3_K_acc_ct1),
tile_x_sc_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q3_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2578,12 +2578,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q3_K<need_check>( mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q3_K_acc_ct1),
tile_x_dm_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q3_K_acc_ct1),
tile_x_qh_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_qh_q3_K_acc_ct1),
tile_x_sc_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q3_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2663,11 +2663,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_K<need_check>( mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q4_K_acc_ct1),
tile_x_dm_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_K_acc_ct1),
tile_x_sc_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q4_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2701,11 +2701,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_K<need_check>( mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q4_K_acc_ct1),
tile_x_dm_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_K_acc_ct1),
tile_x_sc_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q4_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2784,11 +2784,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_K<need_check>( mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_K_acc_ct1),
tile_x_dm_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_K_acc_ct1),
tile_x_sc_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q5_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2822,11 +2822,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_K<need_check>( mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_K_acc_ct1),
tile_x_dm_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_K_acc_ct1),
tile_x_sc_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q5_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2905,11 +2905,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q6_K<need_check>( mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_acc_ct1.get_pointer(), get_pointer(tile_x_ql_acc_ct1),
tile_x_dm_acc_ct1.get_pointer(), get_pointer(tile_x_dm_acc_ct1),
tile_x_sc_acc_ct1.get_pointer(), get_pointer(tile_x_sc_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2943,11 +2943,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q6_K<need_check>( mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_acc_ct1.get_pointer(), get_pointer(tile_x_ql_acc_ct1),
tile_x_dm_acc_ct1.get_pointer(), get_pointer(tile_x_dm_acc_ct1),
tile_x_sc_acc_ct1.get_pointer(), get_pointer(tile_x_sc_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }

View file

@ -218,7 +218,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, eps, item_ct1, norm_f32(x, dst, ncols, eps, item_ct1,
s_sum_acc_ct1.get_pointer(), work_group_size); get_pointer(s_sum_acc_ct1), work_group_size);
}); });
}); });
} }
@ -265,7 +265,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
[[intel::reqd_sub_group_size(WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
group_norm_f32(x, dst, group_size, ne_elements, group_norm_f32(x, dst, group_size, ne_elements,
eps_ct4, item_ct1, eps_ct4, item_ct1,
s_sum_acc_ct1.get_pointer(), work_group_size); get_pointer(s_sum_acc_ct1), work_group_size);
}); });
}); });
} }
@ -306,7 +306,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, eps, item_ct1, rms_norm_f32(x, dst, ncols, eps, item_ct1,
s_sum_acc_ct1.get_pointer(), work_group_size); get_pointer(s_sum_acc_ct1), work_group_size);
}); });
}); });
} }

View file

@ -55,7 +55,7 @@ static void rope_norm(
const int i = row*ne0 + i0; const int i = row*ne0 + i0;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -98,7 +98,7 @@ static void rope_neox(
const int i = row*ne0 + i0/2; const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

View file

@ -136,7 +136,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par, soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
nrows_y, scale, max_bias, m0, nrows_y, scale, max_bias, m0,
m1, n_head_log2, item_ct1, m1, n_head_log2, item_ct1,
local_buf_acc.get_pointer()); get_pointer(local_buf_acc));
}); });
}); });
} }

View file

@ -4,7 +4,7 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-aarch64.h"
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
@ -37,12 +37,12 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#ifdef __ARM_FEATURE_MATMUL_INT8 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE #undef GGML_USE_LLAMAFILE
#endif #endif
#ifdef GGML_USE_LLAMAFILE #ifdef GGML_USE_LLAMAFILE
#include "sgemm.h" #include <llamafile/sgemm.h>
#endif #endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
@ -700,6 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
#else #else
.nrows = 1, .nrows = 1,
#endif #endif
.from_float_to_mat = quantize_mat_q8_0,
}, },
[GGML_TYPE_Q8_1] = { [GGML_TYPE_Q8_1] = {
.type_name = "q8_1", .type_name = "q8_1",
@ -897,6 +898,54 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16, .vec_dot_type = GGML_TYPE_BF16,
.nrows = 1, .nrows = 1,
},
[GGML_TYPE_Q4_0_4_4] = {
.type_name = "q4_0_4x4",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.interleave_blcksize = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
[GGML_TYPE_Q4_0_4_8] = {
.type_name = "q4_0_4x8",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
[GGML_TYPE_Q4_0_8_8] = {
.type_name = "q4_0_8x8",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
.interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
} }
}; };
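The three new type_traits entries above mostly carry metadata plus two optional function pointers (gemv, gemm); the matmul code checks those pointers before taking the repacked fast path. The sketch below is only an illustration of that null-checked dispatch pattern with made-up names, not the real ggml API.

    #include <cstdio>

    typedef void (*gemv_fn)(int n);   // stand-in for ggml_gemv_t

    static void q4_0_4x4_gemv(int n) { std::printf("repacked gemv over %d columns\n", n); }

    struct traits { const char * name; int ncols; gemv_fn gemv; };

    static const traits table[] = {
        { "q4_0",     1, nullptr },        // ordinary type: no repacked kernel available
        { "q4_0_4x4", 4, q4_0_4x4_gemv },  // repacked type: dedicated gemv available
    };

    int main() {
        for (const traits & t : table) {
            if (t.gemv) t.gemv(4096);                                      // fast path
            else        std::printf("%s: generic vec_dot path\n", t.name); // fallback
        }
        return 0;
    }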
@ -3208,6 +3257,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
} }
@ -9467,6 +9519,9 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_add_q_f32(params, dst); ggml_compute_forward_add_q_f32(params, dst);
} break; } break;
@ -9842,6 +9897,9 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_add1_q_f32(params, dst); ggml_compute_forward_add1_q_f32(params, dst);
} break; } break;
@ -9967,6 +10025,9 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -12180,6 +12241,12 @@ static void ggml_compute_forward_mul_mat(
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows; int64_t const vec_dot_num_rows = type_traits[type].nrows;
int64_t const matmul_num_cols = type_traits[type].ncols;
int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
ggml_from_float_to_mat_t const from_float_to_mat
= type_traits[vec_dot_type].from_float_to_mat;
ggml_gemv_t const gemv = type_traits[type].gemv;
ggml_gemm_t const gemm = type_traits[type].gemm;
GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne1 == ne11);
@ -12246,7 +12313,16 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) { int64_t i11_processed = 0;
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
4, ne10, interleave_blcksize);
}
i11_processed = ne11 - ne11 % 4;
}
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10); ne10);
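The rewritten conversion loop above quantizes src1 in two phases: full groups of 4 rows go through from_float_to_mat (so the quantized rows come out interleaved for the repacked kernels), and any ne11 % 4 leftover rows use the ordinary per-row from_float_to_vec_dot. A small stand-alone sketch of how those two loops split the work, with made-up ne11 and thread count:

    #include <cstdio>

    int main() {
        const int ne11 = 10, nth = 2;                  // hypothetical rows / threads
        const int i11_processed = ne11 - ne11 % 4;     // rows covered by 4-row batches
        for (int ith = 0; ith < nth; ++ith) {
            for (int i11 = ith * 4; i11 < i11_processed; i11 += nth * 4)
                std::printf("thread %d: rows %d..%d as one interleaved batch\n", ith, i11, i11 + 3);
            for (int i11 = i11_processed + ith; i11 < ne11; i11 += nth)
                std::printf("thread %d: row %d on its own\n", ith, i11);
        }
        // prints: thread 0 -> rows 0..3 and row 8; thread 1 -> rows 4..7 and row 9
        return 0;
    }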
@ -12327,6 +12403,28 @@ UseGgmlGemm2:;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
if ((ggml_n_dims(src0) == 2) && gemv) {
const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
int64_t src0_start = (ith * ne01) / nth;
int64_t src0_end = ((ith + 1) * ne01) / nth;
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
if (src0_start >= src0_end) return;
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
if (gemm && (ne11 > 3)) {
gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
}
for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
src0_end - src0_start);
}
return;
}
// The first chunk comes from our thread_id, the rest will get auto-assigned. // The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith; int current_chunk = ith;
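In the new fast path above, each thread's slice of src0 rows is rounded up to a multiple of matmul_num_cols so that an interleaved group of rows is never split between threads (a thread whose rounded start reaches its rounded end simply returns). A stand-alone sketch of that rounding with made-up sizes:

    #include <cstdio>

    static long round_up(long x, long m) { return (x % m) ? x + m - (x % m) : x; }

    int main() {
        const long ne01 = 96, nth = 5, cols = 8;   // hypothetical rows, threads, interleave width
        for (long ith = 0; ith < nth; ++ith) {
            const long start = round_up((ith * ne01) / nth, cols);
            const long end   = round_up(((ith + 1) * ne01) / nth, cols);
            if (start >= end) { std::printf("thread %ld: nothing to do\n", ith); continue; }
            std::printf("thread %ld: rows [%ld, %ld)\n", ith, start, end);
        }
        // prints [0,24) [24,40) [40,64) [64,80) [80,96): uneven, but always 8-row aligned
        return 0;
    }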
@ -12372,6 +12470,8 @@ static void ggml_compute_forward_mul_mat_id(
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const matmul_num_cols = type_traits[type].ncols;
ggml_gemv_t const gemv = type_traits[type].gemv;
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb00 == ggml_type_size(type));
@ -12457,6 +12557,34 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows const int64_t nr1 = cne1; // src1 rows
if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
int64_t src0_cur_start = (ith * ne01) / nth;
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
if (src0_cur_start >= src0_cur_end) return;
for (int ir1 = 0; ir1 < nr1; ir1++) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
const int id = row_mapping.i1; // selected expert index
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12 * ne11) * row_size
: (i11 * nb11 + i12 * nb12));
gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
(const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
}
continue;
}
// distribute the thread work across the inner or outer loop based on which one is larger // distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@ -12758,6 +12886,9 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_out_prod_q_f32(params, dst); ggml_compute_forward_out_prod_q_f32(params, dst);
} break; } break;
@ -12943,6 +13074,9 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -13202,6 +13336,9 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_get_rows_q(params, dst); ggml_compute_forward_get_rows_q(params, dst);
} break; } break;
@ -13788,6 +13925,9 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K: case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:
@ -20516,6 +20656,9 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
size_t elemsize = sizeof(ggml_fp16_t); size_t elemsize = sizeof(ggml_fp16_t);
@ -21862,8 +22005,6 @@ int ggml_cpu_has_neon(void) {
int ggml_cpu_has_sve(void) { int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
// TODO: Currently, SVE 256 bit is only supported.
GGML_ASSERT(svcntb() == QK8_0);
return 1; return 1;
#else #else
return 0; return 0;

View file

@ -79,5 +79,4 @@ python -m twine upload dist/*
``` ```
## TODO ## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package. - [ ] Include conversion scripts as command line entry points in this package.

View file

@ -120,7 +120,6 @@ class Keys:
MIDDLE_ID = "tokenizer.ggml.middle_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id" EOT_ID = "tokenizer.ggml.eot_token_id"
# #
# recommended mapping of model tensor names for storage in gguf # recommended mapping of model tensor names for storage in gguf
# #
@ -163,6 +162,7 @@ class MODEL_ARCH(IntEnum):
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
DEEPSEEK2 = auto() DEEPSEEK2 = auto()
CHATGLM = auto()
BITNET = auto() BITNET = auto()
T5 = auto() T5 = auto()
JAIS = auto() JAIS = auto()
@ -289,6 +289,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5", MODEL_ARCH.T5: "t5",
MODEL_ARCH.JAIS: "jais", MODEL_ARCH.JAIS: "jais",
@ -924,6 +925,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP,
], ],
MODEL_ARCH.CHATGLM : [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.BITNET: [ MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
@ -1020,6 +1033,9 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.CHATGLM: [
MODEL_TENSOR.ROPE_FREQS,
],
} }
# #

View file

@ -67,7 +67,7 @@ class ReaderTensor(NamedTuple):
class GGUFReader: class GGUFReader:
# I - same as host, S - swapped # I - same as host, S - swapped
byte_order: Literal['I'] | Literal['S'] = 'I' byte_order: Literal['I', 'S'] = 'I'
alignment: int = GGUF_DEFAULT_ALIGNMENT alignment: int = GGUF_DEFAULT_ALIGNMENT
data_offset: int data_offset: int
@ -86,7 +86,7 @@ class GGUFReader:
GGUFValueType.BOOL: np.bool_, GGUFValueType.BOOL: np.bool_,
} }
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'): def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'):
self.data = np.memmap(path, mode = mode) self.data = np.memmap(path, mode = mode)
offs = 0 offs = 0
@ -140,7 +140,7 @@ class GGUFReader:
return self.tensors[idx] return self.tensors[idx]
def _get( def _get(
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None, self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
) -> npt.NDArray[Any]: ) -> npt.NDArray[Any]:
count = int(count) count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize) itemsize = int(np.empty([], dtype = dtype).itemsize)

View file

@ -6,7 +6,6 @@ from typing import Any, Callable
from collections import deque from collections import deque
import numpy as np import numpy as np
from numpy._typing import _Shape
from numpy.typing import DTypeLike from numpy.typing import DTypeLike
@ -16,16 +15,16 @@ logger = logging.getLogger(__name__)
class LazyMeta(ABCMeta): class LazyMeta(ABCMeta):
def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs): def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
def __getattr__(self, __name: str) -> Any: def __getattr__(self, name: str) -> Any:
meta_attr = getattr(self._meta, __name) meta_attr = getattr(self._meta, name)
if callable(meta_attr): if callable(meta_attr):
return type(self)._wrap_fn( return type(self)._wrap_fn(
(lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)), (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
use_self=self, use_self=self,
) )
elif isinstance(meta_attr, self._tensor_type): elif isinstance(meta_attr, self._tensor_type):
# e.g. self.T with torch.Tensor should still be wrapped # e.g. self.T with torch.Tensor should still be wrapped
return type(self)._wrap_fn(lambda s: getattr(s, __name))(self) return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
else: else:
# no need to wrap non-tensor properties, # no need to wrap non-tensor properties,
# and they likely don't depend on the actual contents of the tensor # and they likely don't depend on the actual contents of the tensor
@ -141,19 +140,21 @@ class LazyBase(ABC, metaclass=LazyMeta):
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
if isinstance(res, cls._tensor_type): if isinstance(res, cls._tensor_type):
def collect_replace(t: LazyBase): class CollectSharedLazy:
if collect_replace.shared_lazy is None:
collect_replace.shared_lazy = t._lazy
else:
collect_replace.shared_lazy.extend(t._lazy)
t._lazy = collect_replace.shared_lazy
# emulating a static variable # emulating a static variable
collect_replace.shared_lazy = None shared_lazy: None | deque[LazyBase] = None
LazyBase._recurse_apply(args, collect_replace) @staticmethod
def collect_replace(t: LazyBase):
if CollectSharedLazy.shared_lazy is None:
CollectSharedLazy.shared_lazy = t._lazy
else:
CollectSharedLazy.shared_lazy.extend(t._lazy)
t._lazy = CollectSharedLazy.shared_lazy
shared_lazy = collect_replace.shared_lazy LazyBase._recurse_apply(args, CollectSharedLazy.collect_replace)
shared_lazy = CollectSharedLazy.shared_lazy
return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs)) return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
else: else:
@ -184,6 +185,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager) lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
lt._data = lt._func(lt._args) lt._data = lt._func(lt._args)
# sanity check # sanity check
assert lt._data is not None
assert lt._data.dtype == lt._meta.dtype assert lt._data.dtype == lt._meta.dtype
assert lt._data.shape == lt._meta.shape assert lt._data.shape == lt._meta.shape
@ -216,7 +218,7 @@ class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray _tensor_type = np.ndarray
@classmethod @classmethod
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]: def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
# The initial idea was to use np.nan as the fill value, # The initial idea was to use np.nan as the fill value,
# but non-float types like np.int16 can't use that. # but non-float types like np.int16 can't use that.
# So zero it is. # So zero it is.

View file

@ -24,6 +24,7 @@ class TensorNameMap:
"backbone.embedding", # mamba "backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf "backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok "transformer.in_out_embed", # Grok
"embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm "transformer.token_embeddings", # openelm
"shared", # t5 "shared", # t5
), ),
@ -55,6 +56,7 @@ class TensorNameMap:
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
"output_layer", # chatglm
), ),
# Output norm # Output norm
@ -71,12 +73,14 @@ class TensorNameMap:
"model.norm_f", # mamba-qbert "model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba "backbone.norm_f", # mamba
"transformer.rms_norm", # Grok "transformer.rms_norm", # Grok
"encoder.final_layernorm", # chatglm
"transformer.norm", # openelm "transformer.norm", # openelm
), ),
# Rope frequencies # Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth "rope.freqs", # llama-pth
"rotary_pos_emb.inv_freq", # chatglm
), ),
} }
@ -101,6 +105,7 @@ class TensorNameMap:
"backbone.layers.{bid}.norm", # mamba "backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm "transformer.layers.{bid}.attn_norm", # openelm
), ),
@ -124,6 +129,7 @@ class TensorNameMap:
"transformer.h.{bid}.mixer.Wqkv", # phi2 "transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj", # phi3 "model.layers.{bid}.self_attn.qkv_proj", # phi3
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", # openelm "transformer.layers.{bid}.attn.qkv_proj", # openelm
), ),
@ -135,7 +141,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2 "model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
), ),
# Attention key # Attention key
@ -147,7 +153,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.k", # refact "transformer.h.{bid}.attn.k", # refact
"model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2 "model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
), ),
# Attention value # Attention value
@ -182,6 +188,7 @@ class TensorNameMap:
"encoder.layers.{bid}.attn.out_proj", # nomic-bert "encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.layers.{bid}.attn.out_proj", # openelm "transformer.layers.{bid}.attn.out_proj", # openelm
), ),
@ -218,6 +225,7 @@ class TensorNameMap:
"h.{bid}.ln_2", # gpt2 "h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2 "model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm "transformer.layers.{bid}.ffn_norm", # openelm
), ),
@ -268,6 +276,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.c_fc", # starcoder2 "model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"model.layers.{bid}.residual_mlp.w3", # arctic "model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
@ -337,6 +346,7 @@ class TensorNameMap:
"transformer.layers.{bid}.ffn.proj_2", # openelm "transformer.layers.{bid}.ffn.proj_2", # openelm
"model.layers.{bid}.residual_mlp.w2", # arctic "model.layers.{bid}.residual_mlp.w2", # arctic
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.9.0" version = "0.9.1"
description = "Read and write ML models in GGUF for GGML" description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [

View file

@ -1,3 +1,5 @@
# pyright: reportUnusedImport=false
from .gguf_convert_endian import main as gguf_convert_endian_entrypoint from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
from .gguf_dump import main as gguf_dump_entrypoint from .gguf_dump import main as gguf_dump_entrypoint
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint from .gguf_set_metadata import main as gguf_set_metadata_entrypoint

91
gguf-py/scripts/gguf_hash.py Executable file
View file

@ -0,0 +1,91 @@
#!/usr/bin/env python3
from __future__ import annotations
import uuid
import hashlib
import logging
import argparse
import os
import sys
from pathlib import Path
from tqdm import tqdm
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
sys.path.insert(0, str(Path(__file__).parent.parent))
from gguf import GGUFReader # noqa: E402
logger = logging.getLogger("gguf-hash")
# UUID_NAMESPACE_LLAMA_CPP = uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
UUID_NAMESPACE_LLAMA_CPP = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')
# For more information about what field.parts and field.data represent,
# please see the comments in the modify_gguf.py example.
def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None:
sha1 = hashlib.sha1()
uuidv5_sha1 = hashlib.sha1()
uuidv5_sha1.update(UUID_NAMESPACE_LLAMA_CPP.bytes)
# Total Weight Calculation For Progress Bar
total_weights = 0
for n, tensor in enumerate(reader.tensors, 1):
# We don't need these
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Calculate Tensor Volume
sum_weights_in_tensor = 1
for dim in tensor.shape:
sum_weights_in_tensor *= dim
total_weights += sum_weights_in_tensor
# Hash Progress Bar
bar = tqdm(desc="Hashing", total=total_weights, unit="weights", unit_scale=True, disable=disable_progress_bar)
# Hashing Process
for n, tensor in enumerate(reader.tensors, 1):
# We don't need these
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Progressbar
sum_weights_in_tensor = 1
for dim in tensor.shape:
sum_weights_in_tensor *= dim
bar.update(sum_weights_in_tensor)
sha1_layer = hashlib.sha1()
sha1_layer.update(tensor.data.data)
sha1.update(tensor.data.data)
uuidv5_sha1.update(tensor.data.data)
print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
# Flush Hash Progress Bar
bar.close()
# Display Hash Output
print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100
print("UUIDv5 {0} {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100
def main() -> None:
parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
parser.add_argument("model", type=str, help="GGUF format model filename")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--progressbar", action="store_true", help="enable progressbar")
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
reader = GGUFReader(args.model, 'r')
gguf_hash(reader, args.model, not args.progressbar)
if __name__ == '__main__':
main()

View file

@ -1,4 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations
import logging import logging
import argparse import argparse
import os import os

View file

@ -1,4 +1,4 @@
import gguf # noqa: F401 import gguf # noqa: F401 # pyright: ignore[reportUnusedImport]
# TODO: add tests # TODO: add tests

View file

@ -88,8 +88,10 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13, LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_VIKING = 16, LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_JAIS = 17, LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
}; };
// note: these values should be synchronized with ggml_rope // note: these values should be synchronized with ggml_rope
@ -160,6 +162,9 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };

View file

@ -1,3 +1,21 @@
{ {
"extraPaths": ["gguf-py"], "extraPaths": ["gguf-py"],
"pythonVersion": "3.9",
"pythonPlatform": "All",
"reportUnusedImport": "warning",
"reportDuplicateImport": "error",
"reportDeprecated": "warning",
"reportUnnecessaryTypeIgnoreComment": "warning",
"executionEnvironments": [
{
// TODO: make this version override work correctly
"root": "gguf-py",
"pythonVersion": "3.8",
},
{
// uses match expressions in steps.py
"root": "examples/server/tests",
"pythonVersion": "3.10",
},
],
} }

View file

@ -0,0 +1,12 @@
-r ../examples/llava/requirements.txt
-r ../examples/server/bench/requirements.txt
-r ../examples/server/tests/requirements.txt
-r ./requirements-compare-llama-bench.txt
-r ./requirements-pydantic.txt
-r ./requirements-test-tokenizer-random.txt
-r ./requirements-convert_hf_to_gguf.txt
-r ./requirements-convert_hf_to_gguf_update.txt
-r ./requirements-convert_legacy_llama.txt
-r ./requirements-convert_llama_ggml_to_gguf.txt

View file

@ -0,0 +1,2 @@
tabulate~=0.9.0
GitPython~=3.1.43

View file

@ -0,0 +1,2 @@
docstring_parser~=0.15
pydantic~=2.6.3

View file

@ -0,0 +1 @@
cffi~=1.16.0

View file

@ -62,6 +62,12 @@
#include <io.h> #include <io.h>
#endif #endif
#if __cplusplus >= 202000L
#define LU8(x) (const char*)(u8##x)
#else
#define LU8(x) u8##x
#endif
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <cassert> #include <cassert>
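The LU8 macro added above exists because C++20 changed the type of u8"..." literals from const char[] to const char8_t[], which no longer converts implicitly to const char *; the cast restores the old behaviour for char*-based APIs. A small independent sketch (it tests against the standard 202002L value rather than the 202000L threshold used in the file):

    #include <cstdio>

    #if __cplusplus >= 202002L
    #define LU8(x) (const char *)(u8##x)   // C++20: u8"" is char8_t[], cast it back
    #else
    #define LU8(x) u8##x                   // pre-C++20: u8"" is already const char[]
    #endif

    int main() {
        std::printf("%s\n", LU8("héllo"));  // same UTF-8 bytes under either standard
        return 0;
    }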
@ -253,6 +259,7 @@ enum llm_arch {
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM,
LLM_ARCH_BITNET, LLM_ARCH_BITNET,
LLM_ARCH_T5, LLM_ARCH_T5,
LLM_ARCH_JAIS, LLM_ARCH_JAIS,
@ -296,6 +303,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5, "t5" },
{ LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_JAIS, "jais" },
@ -1229,6 +1237,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
{
LLM_ARCH_CHATGLM,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{ {
LLM_ARCH_BITNET, LLM_ARCH_BITNET,
{ {
@ -2115,9 +2138,11 @@ enum e_model {
MODEL_2_8B, MODEL_2_8B,
MODEL_3B, MODEL_3B,
MODEL_4B, MODEL_4B,
MODEL_6B,
MODEL_6_9B, MODEL_6_9B,
MODEL_7B, MODEL_7B,
MODEL_8B, MODEL_8B,
MODEL_9B,
MODEL_11B, MODEL_11B,
MODEL_12B, MODEL_12B,
MODEL_13B, MODEL_13B,
@ -2143,7 +2168,6 @@ enum e_model {
MODEL_16x12B, MODEL_16x12B,
MODEL_10B_128x3_66B, MODEL_10B_128x3_66B,
MODEL_57B_A14B, MODEL_57B_A14B,
MODEL_9B,
MODEL_27B, MODEL_27B,
}; };
@ -3804,6 +3828,9 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default: default:
{ {
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -4510,6 +4537,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
@ -4543,9 +4573,11 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_2_8B: return "2.8B"; case MODEL_2_8B: return "2.8B";
case MODEL_3B: return "3B"; case MODEL_3B: return "3B";
case MODEL_4B: return "4B"; case MODEL_4B: return "4B";
case MODEL_6B: return "6B";
case MODEL_6_9B: return "6.9B"; case MODEL_6_9B: return "6.9B";
case MODEL_7B: return "7B"; case MODEL_7B: return "7B";
case MODEL_8B: return "8B"; case MODEL_8B: return "8B";
case MODEL_9B: return "9B";
case MODEL_11B: return "11B"; case MODEL_11B: return "11B";
case MODEL_12B: return "12B"; case MODEL_12B: return "12B";
case MODEL_13B: return "13B"; case MODEL_13B: return "13B";
@ -4571,7 +4603,6 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_16x12B: return "16x12B"; case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B"; case MODEL_10B_128x3_66B: return "10B+128x3.66B";
case MODEL_57B_A14B: return "57B.A14B"; case MODEL_57B_A14B: return "57B.A14B";
case MODEL_9B: return "9B";
case MODEL_27B: return "27B"; case MODEL_27B: return "27B";
default: return "?B"; default: return "?B";
} }
@ -4678,16 +4709,6 @@ static void llm_load_hparams(
// non-transformer models do not have attention heads // non-transformer models do not have attention heads
if (hparams.n_head() > 0) { if (hparams.n_head() > 0) {
// sanity check for n_rot (optional)
hparams.n_rot = hparams.n_embd / hparams.n_head();
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd / hparams.n_head()) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head()));
}
}
// gpt-neox n_rot = rotary_pct * (n_embd / n_head) // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim // gpt-j n_rot = rotary_dim
@ -4696,6 +4717,17 @@ static void llm_load_hparams(
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head(); hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
// sanity check for n_rot (optional)
hparams.n_rot = hparams.n_embd_head_k;
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
}
} else { } else {
hparams.n_rot = 0; hparams.n_rot = 0;
hparams.n_embd_head_k = 0; hparams.n_embd_head_k = 0;
@ -5176,6 +5208,15 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_CHATGLM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: model.type = e_model::MODEL_6B; break;
case 40: model.type = e_model::MODEL_9B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_BITNET: case LLM_ARCH_BITNET:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5309,9 +5350,7 @@ static void llm_load_vocab(
if (merges_keyidx == -1) { if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n"); throw std::runtime_error("cannot find tokenizer merges in model file\n");
} }
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) { for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
if (!OldBPETokenizerMode) if (!OldBPETokenizerMode)
@ -5463,6 +5502,10 @@ static void llm_load_vocab(
tokenizer_pre == "poro-chat") { tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
vocab.tokenizer_clean_spaces = false; vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "chatglm-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
vocab.special_bos_id = -1;
} else if ( } else if (
tokenizer_pre == "viking") { tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
@ -5596,7 +5639,6 @@ static void llm_load_vocab(
vocab.special_eot_id = 107; vocab.special_eot_id = 107;
} }
} }
try { try {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} catch (const std::exception & e) { } catch (const std::exception & e) {
@ -7513,6 +7555,36 @@ static bool llm_load_tensors(
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
} }
} break; } break;
case LLM_ARCH_CHATGLM:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + (hparams.n_embd_head_k << 2)});
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + (hparams.n_embd_head_k << 2)});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
}
} break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
} }
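The fused QKV width n_embd + (n_embd_head_k << 2) used above reflects ChatGLM's multi-query attention: K and V each contribute n_head_kv * n_embd_head_k columns. A hedged arithmetic check, assuming the commonly published ChatGLM3-6B shapes (hidden size 4096, head dim 128, 2 KV groups); treat the concrete numbers as assumptions, not values read from this diff:

#include <cassert>

int main() {
    const int n_embd        = 4096;                        // hidden size (assumed)
    const int n_embd_head_k = 128;                         // per-head dim (assumed)
    const int n_head_kv     = 2;                           // multi-query KV groups (assumed)
    const int n_embd_gqa    = n_head_kv * n_embd_head_k;   // 256 columns for K, 256 for V
    assert(n_embd + (n_embd_head_k << 2) == n_embd + 2 * n_embd_gqa); // both sides are 4608
    return 0;
}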
@ -7737,6 +7809,7 @@ enum llm_ffn_op_type {
LLM_FFN_GELU, LLM_FFN_GELU,
LLM_FFN_RELU, LLM_FFN_RELU,
LLM_FFN_RELU_SQR, LLM_FFN_RELU_SQR,
LLM_FFN_SWIGLU,
}; };
enum llm_ffn_gate_type { enum llm_ffn_gate_type {
@ -7941,6 +8014,19 @@ static struct ggml_tensor * llm_build_ffn(
cur = ggml_sqr(ctx, cur); cur = ggml_sqr(ctx, cur);
cb(cur, "ffn_sqr(relu)", il); cb(cur, "ffn_sqr(relu)", il);
} break; } break;
case LLM_FFN_SWIGLU:
{
// Project to 4h. If using SwiGLU, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
int64_t split_point = cur->ne[0] / 2;
struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
x0 = ggml_silu(ctx, x0);
cb(x0, "ffn_silu", il);
cur = ggml_mul(ctx, x0, x1);
cb(cur, "ffn_mul", il);
} break;
} }
if (type_gate == LLM_FFN_PAR) { if (type_gate == LLM_FFN_PAR) {
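The new LLM_FFN_SWIGLU case splits the doubled up-projection in half, passes the first half through SiLU and multiplies it with the second half. A self-contained numeric sketch of the same operation on a flat buffer (illustrative only, not ggml code):

#include <cmath>
#include <vector>

// up has width 2*d per row; the result has width d
std::vector<float> swiglu_row(const std::vector<float> & up) {
    const size_t d = up.size() / 2;
    std::vector<float> out(d);
    for (size_t i = 0; i < d; ++i) {
        const float x0 = up[i];         // gated half
        const float x1 = up[d + i];     // linear half
        out[i] = (x0 / (1.0f + std::exp(-x0))) * x1;   // silu(x0) * x1
    }
    return out;
}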
@ -8129,7 +8215,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il); cb(kq, "kq", il);
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32); ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@ -10789,19 +10875,12 @@ struct llm_build_context {
// special-case: the up and gate tensors are merged into a single tensor // special-case: the up and gate tensors are merged into a single tensor
// TODO: support in llm_build_ffn // TODO: support in llm_build_ffn
{ {
struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); cur = llm_build_ffn(ctx0, cur,
cb(up, "ffn_up", il); model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0)); model.layers[il].ffn_down, NULL, NULL,
auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2)); NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
cb(y, "ffn_gate", il);
auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
cb(down, "ffn_down", il);
cur = down;
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} }
@ -11571,7 +11650,7 @@ struct llm_build_context {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -11580,7 +11659,7 @@ struct llm_build_context {
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -11684,7 +11763,7 @@ struct llm_build_context {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -11693,7 +11772,7 @@ struct llm_build_context {
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -13214,6 +13293,8 @@ struct llm_build_context {
LLM_NORM_RMS, cb, -1); LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1); cb(cur, "result_norm", -1);
} else { } else {
GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
@ -13493,6 +13574,120 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_chatglm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
struct ggml_tensor * Qcur = nullptr;
struct ggml_tensor * Kcur = nullptr;
struct ggml_tensor * Vcur = nullptr;
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur_rope", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// Add the input
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// FF
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
inpL = ggml_add(ctx0, cur, ffn_inp);
cb(inpL, "l_out", il);
}
cur = llm_build_norm(ctx0, inpL, hparams,
model.output_norm,
NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
}; };
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) { static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -13724,6 +13919,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_deepseek2(); result = llm.build_deepseek2();
} break; } break;
case LLM_ARCH_CHATGLM:
{
result = llm.build_chatglm();
} break;
case LLM_ARCH_BITNET: case LLM_ARCH_BITNET:
{ {
result = llm.build_bitnet(); result = llm.build_bitnet();
@ -15560,6 +15759,11 @@ struct llm_tokenizer_bpe {
" ?[^(\\s|.,!?…。,、।۔،)]+", " ?[^(\\s|.,!?…。,、।۔،)]+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
regex_exprs = {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_VIKING: case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = { regex_exprs = {
"\\p{N}", "\\p{N}",
@ -16488,7 +16692,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
if (add_special) { if (add_special) {
tokenizer.append_bos(output); tokenizer.append_bos(output);
} }
for (const auto & fragment : fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@ -17915,6 +18118,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S; new_type = GGML_TYPE_IQ3_S;
} }
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
} }
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@ -18227,6 +18434,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
} }
@ -18537,6 +18747,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data(); f32_data = (float *) f32_conv_buf.data();
} }
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout); fflush(stdout);
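The block above silently falls back to plain Q4_0 whenever a tensor's row count is not a multiple of the interleave factor (8 for Q4_0_8_8, 4 for the Q4_0_4_x variants). The same rule restated as a standalone helper, purely for clarity (not llama.cpp code):

#include "ggml.h"

static ggml_type pick_q4_0_variant(ggml_type requested, int64_t nrows) {
    if (requested == GGML_TYPE_Q4_0_8_8 && nrows % 8 != 0) {
        return GGML_TYPE_Q4_0;   // rows must interleave in groups of 8
    }
    if ((requested == GGML_TYPE_Q4_0_4_4 || requested == GGML_TYPE_Q4_0_4_8) && nrows % 4 != 0) {
        return GGML_TYPE_Q4_0;   // rows must interleave in groups of 4
    }
    return requested;
}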
@ -18549,7 +18767,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1]; const int64_t nrows = tensor->ne[1];
static const int64_t min_chunk_size = 32 * 512; static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
chunk_size_multiplier;
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@ -19482,6 +19701,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_OLMO: case LLM_ARCH_OLMO:
case LLM_ARCH_ARCTIC: case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2 // the pairs of head values are offset by n_rot/2
@ -21225,7 +21445,6 @@ int32_t llama_tokenize(
bool add_special, bool add_special,
bool parse_special) { bool parse_special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special); auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
if (n_tokens_max < (int) res.size()) { if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size()); return -((int) res.size());
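The early return of -((int) res.size()) gives callers a way to size their buffer: a negative result is the required token count. A hedged sketch of that two-pass pattern against the public API (the model pointer is assumed to come from elsewhere):

#include "llama.h"
#include <string>
#include <vector>

std::vector<llama_token> tokenize_all(const llama_model * model, const std::string & text) {
    // first pass: probe with an empty buffer; a negative return is the needed size
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               nullptr, 0, /*add_special=*/true, /*parse_special=*/false);
    std::vector<llama_token> tokens(n < 0 ? (size_t) -n : (size_t) n);
    // second pass: tokenize into the correctly sized buffer
    n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                       tokens.data(), (int32_t) tokens.size(), true, false);
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}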
@ -21650,12 +21869,31 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
} }
} else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) { } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
// chatglm3-6b
ss << "[gMASK]" << "sop";
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n " << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
if (role == "user") { if (role == "user") {
ss << u8"<用户>"; ss << LU8("<用户>");
ss << trim(message->content); ss << trim(message->content);
ss << "<AI>"; ss << "<AI>";
} else { } else {
@ -21671,7 +21909,7 @@ static int32_t llama_chat_apply_template_internal(
} else if (role == "user") { } else if (role == "user") {
ss << "User: " << message->content << "\n\n"; ss << "User: " << message->content << "\n\n";
} else if (role == "assistant") { } else if (role == "assistant") {
ss << "Assistant: " << message->content << u8"<end▁of▁sentence>"; ss << "Assistant: " << message->content << LU8("<end▁of▁sentence>");
} }
} }
if (add_ass) { if (add_ass) {
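A hedged usage sketch for the new chatglm4 branch through llama_chat_apply_template; passing the template name directly avoids needing a loaded model, and the initial buffer size is only an illustrative guess:

#include "llama.h"
#include <string>
#include <vector>

std::string render_chatglm4(const std::vector<llama_chat_message> & msgs) {
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(nullptr, "chatglm4", msgs.data(), msgs.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return "";                      // template not recognized
    }
    if (n > (int32_t) buf.size()) {     // result did not fit: grow and re-render
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "chatglm4", msgs.data(), msgs.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), (size_t) n);
}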

View file

@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"