mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 17:44:38 +00:00
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ci/run.sh
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/repack.cpp
#	ggml/src/ggml-sycl/binbcast.cpp
#	ggml/src/ggml-sycl/concat.cpp
#	ggml/src/ggml-sycl/conv.cpp
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/cpy.cpp
#	ggml/src/ggml-sycl/dmmv.cpp
#	ggml/src/ggml-sycl/dpct/helper.hpp
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/getrows.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/gla.cpp
#	ggml/src/ggml-sycl/im2col.cpp
#	ggml/src/ggml-sycl/mmq.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/norm.cpp
#	ggml/src/ggml-sycl/rope.cpp
#	ggml/src/ggml-sycl/softmax.cpp
#	ggml/src/ggml-sycl/tsembd.cpp
#	ggml/src/ggml-sycl/wkv.cpp
#	tests/test-backend-ops.cpp

commit 4f2fcaa2ef
21 changed files with 443 additions and 81 deletions
@@ -2708,6 +2708,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
    add_opt(common_arg(
        {"--host"}, "HOST",
        string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -1298,6 +1298,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
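A note on the new contract: llama_tokenize() can now return INT32_MIN when the token count would overflow int32_t, and common_tokenize() surfaces that as an exception instead of resizing by a garbage value. A hedged caller-side sketch (ctx and text assumed from surrounding code; not a complete program):

    try {
        const std::vector<llama_token> toks = common_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/true);
        // use toks ...
    } catch (const std::runtime_error & err) {
        fprintf(stderr, "tokenization failed: %s\n", err.what());
    }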
@@ -354,6 +354,7 @@ struct common_params {
    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep   = "\n";  // separator of embeddings
+   std::string cls_sep    = "\t";  // separator of classification sequences
 
    // server params
    int32_t port           = 8080;  // server listens on this network port
@@ -2145,7 +2145,6 @@ class Llama4Model(LlamaModel):
 
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -3918,9 +3917,6 @@ class BertModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +3958,6 @@ class RobertaModel(BertModel):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
 
             # we need this to validate the size of the token_type embeddings
             # though currently we are passing all zeros to the token_type embeddings
@@ -4848,8 +4842,6 @@ class JinaBertV2Model(BertModel):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5443,6 @@ class T5Model(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
 
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5580,6 @@ class T5EncoderModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
 
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -133,10 +133,36 @@ int main(int argc, char ** argv) {
     // max batch size
     const uint64_t n_batch = params.n_batch;
 
+    // get added sep and eos token, if any
+    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
+    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<llama_token> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
        if (inp.size() > n_batch) {
            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
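For orientation, a self-contained sketch of the pair-splicing logic above, with illustrative token texts (real runs take cls_sep from --cls-separator and the token texts from the model's vocab via llama_vocab_get_text):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const std::string cls_sep   = "\t";    // default --cls-separator value
        const std::string added_eos = "</s>";  // assumed token texts, not from the repo
        const std::string added_sep = "<sep>";
        std::string prompt = "what is panda?\tThe giant panda is a bear species.";

        // split on the separator (stand-in for split_lines(prompt, params.cls_sep))
        std::vector<std::string> pairs;
        size_t pos = 0;
        while ((pos = prompt.find(cls_sep)) != std::string::npos) {
            pairs.push_back(prompt.substr(0, pos));
            prompt.erase(0, pos + cls_sep.size());
        }
        pairs.push_back(prompt);

        // splice EOS + SEP between the segments, as the diff above does
        std::string final_prompt;
        for (size_t i = 0; i < pairs.size(); i++) {
            final_prompt += pairs[i];
            if (i != pairs.size() - 1) {
                final_prompt += added_eos + added_sep;
            }
        }
        std::printf("%s\n", final_prompt.c_str()); // what is panda?</s><sep>The giant panda is a bear species.
    }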
@@ -145,11 +171,11 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp (new file, 82 lines)
@@ -0,0 +1,82 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#endif
+
+#include <string>
+
+struct powerpc_features {
+    std::string platform = "";
+    int power_version = -1;
+
+    bool has_vsx = false;
+
+    powerpc_features() {
+#if defined(__linux__)
+        unsigned long auxval = getauxval(AT_PLATFORM);
+        if (auxval) {
+            platform = std::string(reinterpret_cast<const char*>(auxval));
+            // TBD: Do systems exist that return this in uppercase?
+            if (platform.substr(0, 5) == "power") {
+                // Extract a numeric suffix, if one exists
+                int vpos = -1;
+                for (int i = platform.length() - 1; i >= 0; i--) {
+                    if (std::isdigit(platform[i])) {
+                        vpos = i;
+                    } else {
+                        break;
+                    }
+                }
+                if (vpos > -1) {
+                    power_version = std::stoi(platform.substr(vpos));
+                }
+            }
+        }
+#endif
+        if (power_version >= 9) {
+            has_vsx = true;
+        }
+    }
+};
+
+static int ggml_backend_cpu_powerpc_score() {
+    int score = 1;
+    powerpc_features pf;
+
+// Platform scores
+#if defined(GGML_USE_POWER7)
+    if (pf.power_version < 7) { return 0; }
+    score += 1<<1;
+#endif
+#if defined(GGML_USE_POWER8)
+    if (pf.power_version < 8) { return 0; }
+    score += 1<<2;
+#endif
+#if defined(GGML_USE_POWER9)
+    if (pf.power_version < 9) { return 0; }
+    score += 1<<3;
+#endif
+#if defined(GGML_USE_POWER10)
+    if (pf.power_version < 10) { return 0; }
+    score += 1<<4;
+#endif
+#if defined(GGML_USE_POWER11)
+    if (pf.power_version < 11) { return 0; }
+    score += 1<<5;
+#endif
+
+// Feature scores
+#if defined(GGML_USE_VSX)
+    if (!pf.has_vsx) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_powerpc_score)
+
+#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
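The probe above keys everything off the AT_PLATFORM auxiliary-vector entry. A minimal standalone sketch of that probe (Linux-only; prints e.g. "power10" on a Power10 host):

    #include <cstdio>
    #include <sys/auxv.h>

    int main() {
        unsigned long v = getauxval(AT_PLATFORM);
        if (v) {
            std::printf("platform: %s\n", reinterpret_cast<const char *>(v));
        }
        return 0;
    }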
@@ -1180,13 +1180,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         // not realy a GGML_TYPE_Q8_0 but same size.
         switch (op->op) {
             case GGML_OP_MUL_MAT:
+                {
                     size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                     return true;
+                }
             case GGML_OP_MUL_MAT_ID:
+                {
                     size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                     size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
-                    size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
+
+                    const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
+                    const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
+
+                    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
+
+                    size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
+
                     return true;
+                }
             default:
                 // GGML_ABORT("fatal error");
                 break;
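For concreteness, the corrected MUL_MAT_ID term reserves one row-count slot per expert plus one row mapping per (expert, token) pair, each 8 bytes. A self-contained check with made-up sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne02 = 8;   // n_as: number of experts (illustrative)
        const int64_t ne12 = 32;  // n_tokens (illustrative)
        const size_t  sizeof_mmid_row_mapping = sizeof(int64_t); // 2 * int32_t
        // one row count per expert + one row mapping per (expert, token)
        const size_t  bytes = sizeof_mmid_row_mapping * ne02 * (ne12 + 1);
        std::printf("mmid mapping workspace: %zu bytes\n", bytes); // 2112
    }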
@@ -1322,13 +1333,16 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             int32_t i2;
         };
 
-        GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
-                                      n_as * ne12 * sizeof(mmid_row_mapping)));
+        GGML_ASSERT(params->wsize >=
+                (GGML_PAD(nbw3, sizeof(int64_t)) +
+                 n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
+                );
 
-        auto * wdata = (char *) params->wdata;
-        auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
+        auto * wdata = (char *)params->wdata;
+        auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
+
+        // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
         auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
         struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
 
         // src1: float32 => param type
@@ -1414,15 +1428,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
     }
 };
 
-// instance for Q4
-static const tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
-static const tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
-static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
-static const tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
-
-// instance for IQ4
-static const tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
-
 }  // namespace ggml::cpu::repack
 
 static void flag_aarch_prepacked_quant(int type)
@@ -1435,55 +1440,65 @@ static void flag_aarch_prepacked_quant(int type)
 }
 
 static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
+
+    // instance for Q4
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
+
+    // instance for IQ4
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+
     if (cur->type == GGML_TYPE_Q4_0) {
         //we shall just use the regular avx2 handling, no repacking
         if (/*ggml_cpu_has_avx2() ||*/ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
             if (cur->ne[1] % 8 == 0) {
-                return &ggml::cpu::repack::q4_0_8x8_q8_0;
+                return &q4_0_8x8_q8_0;
             }
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::q4_0_4x8_q8_0;
+                return &q4_0_4x8_q8_0;
             }
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::q4_0_4x4_q8_0;
+                return &q4_0_4x4_q8_0;
             }
         }
     } else if (cur->type == GGML_TYPE_Q4_K) {
-        // if (ggml_cpu_has_avx2()) { //we shall just use the regular avx2 handling, no repacking otherwise massive slowdown with gpu
+        // if (ggml_cpu_has_avx2()) {
         //     if (cur->ne[1] % 8 == 0) {
-        //         return &ggml::cpu::aarch64::q4_K_8x8_q8_K;
+        //         return &q4_K_8x8_q8_K;
         //     }
         // }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
+                return &iq4_nl_4x4_q8_0;
             }
         }
     }
     else if (cur->type == GGML_TYPE_Q4_0_4_4) //kcpp backport old quant support
     {
         flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::q4_0_4x4_q8_0;
+        return &q4_0_4x4_q8_0;
     }
     else if (cur->type == GGML_TYPE_Q4_0_4_8)
     {
         flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::q4_0_4x8_q8_0;
+        return &q4_0_4x8_q8_0;
     }
     else if (cur->type == GGML_TYPE_Q4_0_8_8)
     {
         flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::q4_0_8x8_q8_0;
+        return &q4_0_8x8_q8_0;
     }
     else if (cur->type == GGML_TYPE_IQ4_NL)
     {
         flag_aarch_prepacked_quant(cur->type);
-        return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
+        return &iq4_nl_4x4_q8_0;
     }
 
     return nullptr;
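The motivation for moving the trait instances from namespace scope into ggml_repack_get_optimal_repack_type() is standard C++: function-local statics are initialized on first use rather than at library load, which sidesteps static-initialization-order concerns and keeps the namespace clean. A minimal self-contained sketch of the pattern (hypothetical names, not the ggml types):

    #include <cstdio>

    struct traits { int nb_cols; };

    static const traits * pick_traits(int ne1) {
        static const traits t4x4 = {4};  // constructed lazily, on first call
        static const traits t8x8 = {8};
        if (ne1 % 8 == 0) return &t8x8;
        if (ne1 % 4 == 0) return &t4x4;
        return nullptr;
    }

    int main() {
        const traits * t = pick_traits(16);
        std::printf("nb_cols = %d\n", t ? t->nb_cols : 0); // nb_cols = 8
    }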
@@ -19,10 +19,10 @@
 #endif
 #include "ggml-common.h"
 
-#include <cstdio>
 #include <array>
 #include <cassert>
 #include <cfloat>
+#include <cstdio>
 #include <string>
 #include <vector>
 
@@ -771,21 +771,7 @@ struct ggml_backend_cuda_context {
         name(GGML_CUDA_NAME + std::to_string(device)) {
     }
 
-    ~ggml_backend_cuda_context() {
-        if (copy_event != nullptr) {
-            CUDA_CHECK(cudaEventDestroy(copy_event));
-        }
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
-            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
-                if (streams[i][j] != nullptr) {
-                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
-                }
-            }
-            if (cublas_handles[i] != nullptr) {
-                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
-            }
-        }
-    }
+    ~ggml_backend_cuda_context();
 
     cudaStream_t stream(int device, int stream) {
         if (streams[device][stream] == nullptr) {
ggml/src/ggml-cuda/conv2d-transpose.cu (new file, 91 lines)
@@ -0,0 +1,91 @@
+#include <algorithm>
+
+#include "conv2d-transpose.cuh"
+#include "ggml.h"
+
+__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
+                                        float * __restrict__ output, const int in_w, const int in_h, const int out_w,
+                                        const int out_h, const int kernel_w, const int kernel_h, const int stride,
+                                        const int c_in, const int c_out, const int batches) {
+    const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const int total_elements = out_w * out_h * c_out * batches;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    const int out_x_idx = global_idx % out_w;
+    const int out_y_idx = (global_idx / out_w) % out_h;
+    const int c_idx     = (global_idx / (out_w * out_h)) % c_out;
+    const int n_idx     = global_idx / (out_w * out_h * c_out);
+
+    float accumulator = 0;
+    // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds
+
+    for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            int in_y = out_y_idx - kh;
+            if (in_y < 0 || in_y % stride) continue;
+            in_y /= stride;
+            if (in_y >= in_h) continue;
+
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int in_x = out_x_idx - kw;
+                if (in_x < 0 || in_x % stride) continue;
+                in_x /= stride;
+                if (in_x >= in_w) continue;
+
+                const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) * in_y + in_x;
+                const int kernel_idx =
+                    (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) * kh + kw;
+
+                float input_val = input[input_idx];
+                half  kern_val  = kernel[kernel_idx];
+
+                accumulator += input_val * (float) kern_val;
+            }
+        }
+    }
+
+    output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) * out_y_idx + out_x_idx] = accumulator;
+}
+
+//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in)
+void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+
+    GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+
+    const float * input_data  = (const float *) input->data;
+    float * output_data       = (float *) dst->data;
+    const half * kernel_data  = (const half *) kernel->data;
+
+    const int input_w      = input->ne[0];
+    const int input_h      = input->ne[1];
+    const int output_w     = dst->ne[0];
+    const int output_h     = dst->ne[1];
+    const int channels_in  = input->ne[2];
+    const int channels_out = kernel->ne[2];
+    const int kernel_w     = kernel->ne[0];
+    const int kernel_h     = kernel->ne[1];
+    const int stride       = dst->op_params[0];
+    const int batches      = input->ne[3];
+
+    GGML_ASSERT(channels_in == kernel->ne[3]);
+    GGML_ASSERT(stride > 0);
+
+    cudaStream_t st = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(input));
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int total  = (output_w * output_h * channels_out * batches);
+    const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
+
+    conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
+        input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
+        channels_in, channels_out, batches);
+}
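As a sanity check on the kernel's gather logic: with no padding or dilation (the only case this op handles), each output pixel (ox, oy) accumulates the input pixels where ox - kw and oy - kh are non-negative multiples of the stride, which corresponds to the usual transposed-convolution shape relation. A self-contained check, assuming that relation holds for this op:

    #include <cstdio>

    int main() {
        const int in_w = 8, in_h = 8, kernel_w = 3, kernel_h = 3, stride = 2;
        const int out_w = (in_w - 1) * stride + kernel_w;  // 17
        const int out_h = (in_h - 1) * stride + kernel_h;  // 17
        std::printf("output: %dx%d\n", out_w, out_h);
    }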
ggml/src/ggml-cuda/conv2d-transpose.cuh (new file, 4 lines)
@@ -0,0 +1,4 @@
+#include "common.cuh"
+
+#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
+void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -14,6 +14,7 @@ bool g_mul_mat_q = true;
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/conv2d-dw.cuh"
+#include "ggml-cuda/conv2d-transpose.cuh"
 #include "ggml-cuda/convert.cuh"
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
@@ -50,6 +51,7 @@ bool g_mul_mat_q = true;
 #include <atomic>
 #include <charconv>
 #include <cinttypes>
+#include <condition_variable>
 #include <cstddef>
 #include <cstdint>
 #include <float.h>
@@ -57,9 +59,8 @@ bool g_mul_mat_q = true;
 #include <map>
 #include <memory>
 #include <mutex>
-#include <stdint.h>
-#include <stdio.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string>
 #include <vector>
@@ -516,6 +517,33 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 
+// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
+// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
+
+static std::mutex ggml_cuda_lock;
+static std::condition_variable ggml_cuda_lock_cv;
+static std::atomic<int> ggml_cuda_lock_counter;
+
+ggml_backend_cuda_context::~ggml_backend_cuda_context() {
+    std::unique_lock<std::mutex> lock(ggml_cuda_lock);
+    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
+
+    if (copy_event != nullptr) {
+        CUDA_CHECK(cudaEventDestroy(copy_event));
+    }
+    for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+        for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+            if (streams[i][j] != nullptr) {
+                CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+            }
+        }
+        if (cublas_handles[i] != nullptr) {
+            CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+        }
+    }
+}
+
+
 // cuda buffer
 
 struct ggml_backend_cuda_buffer_context {
@@ -2319,6 +2347,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CONV_2D_DW:
             ggml_cuda_op_conv2d_dw(ctx, dst);
             break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            ggml_cuda_conv_2d_transpose_p0(ctx, dst);
+            break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             ggml_cuda_op_conv_transpose_1d(ctx,dst);
             break;
@@ -2694,6 +2725,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
             CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
             graph_evaluated_or_captured = true; // CUDA graph has been captured
+
+            std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+            if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+                ggml_cuda_lock_cv.notify_all();
+            }
         } else {
             graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
         }
@@ -2769,7 +2805,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         }
     }
 
-    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+    if (use_cuda_graph && cuda_graph_update_required) {
+        // Start CUDA graph capture
+        {
+            std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+        }
+
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
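Taken together, the three CUDA hunks above form a counter-gated shutdown: capture paths bump an atomic counter under a mutex, and the context destructor blocks on a condition variable until the counter drains to zero before tearing down streams and cuBLAS handles. A minimal self-contained sketch of the handshake (illustrative names, not the ggml symbols):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    static std::mutex              lock_mtx;
    static std::condition_variable lock_cv;
    static std::atomic<int>        capture_count{0};

    void begin_capture() {
        std::lock_guard<std::mutex> lock(lock_mtx);
        capture_count.fetch_add(1, std::memory_order_relaxed);
    }

    void end_capture() {
        std::lock_guard<std::mutex> lock(lock_mtx);
        if (capture_count.fetch_sub(1, std::memory_order_relaxed) == 1) {
            lock_cv.notify_all(); // wake any destructor waiting for captures to drain
        }
    }

    void destroy_context() {
        std::unique_lock<std::mutex> lock(lock_mtx);
        lock_cv.wait(lock, [] { return capture_count.load(std::memory_order_relaxed) == 0; });
        // now safe to destroy streams and cuBLAS handles
    }

    int main() {
        begin_capture();
        end_capture();
        destroy_context(); // returns immediately: no capture in flight
    }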
@@ -3219,6 +3261,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
         case GGML_OP_IM2COL:
         case GGML_OP_CONV_2D_DW:
+        case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
@@ -198,6 +198,7 @@ class Keys:
         MASK_ID              = "tokenizer.ggml.mask_token_id"
         ADD_BOS              = "tokenizer.ggml.add_bos_token"
         ADD_EOS              = "tokenizer.ggml.add_eos_token"
+        ADD_SEP              = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX           = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS      = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -891,6 +891,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_add_sep_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_SEP, value)
+
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
@@ -119,6 +119,7 @@ class SpecialVocab:
                 logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
 
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
+        tokenizer = None
         tokenizer_file = path / 'tokenizer.json'
         if tokenizer_file.is_file():
             with open(tokenizer_file, encoding = 'utf-8') as f:
@@ -152,11 +153,87 @@ class SpecialVocab:
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}
+        tokenizer_config = None
         tokenizer_config_file = path / 'tokenizer_config.json'
-        if not tokenizer_config_file.is_file():
-            return True
-        with open(tokenizer_config_file, encoding = 'utf-8') as f:
-            tokenizer_config = json.load(f)
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, encoding = 'utf-8') as f:
+                tokenizer_config = json.load(f)
+        if tokenizer:
+            special_bos = (tokenizer_config or {}).get('bos_token')
+            special_cls = (tokenizer_config or {}).get('cls_token')
+            special_eos = (tokenizer_config or {}).get('eos_token')
+            special_sep = (tokenizer_config or {}).get('sep_token')
+            if not special_bos and special_cls and tokenizer_config:
+                tokenizer_config['bos_token'] = special_bos = special_cls
+            if not special_eos and special_sep and tokenizer_config:
+                tokenizer_config['eos_token'] = special_eos = special_sep
+            post_processor = tokenizer.get('post_processor', {})
+            for processor in post_processor.get('processors', [post_processor]):
+                if processor.get('type') == 'RobertaProcessing':
+                    self.add_special_token['bos'] = True
+                    self.add_special_token['eos'] = True
+                    self.add_special_token['sep'] = True
+                    if not special_cls and tokenizer_config:
+                        special_cls = processor.get('cls', [special_bos])[0]
+                        tokenizer_config['cls_token'] = special_cls
+                    if not special_sep and tokenizer_config:
+                        special_sep = processor.get('sep', [special_eos])[0]
+                        tokenizer_config['sep_token'] = special_sep
+                    continue
+                # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
+                # Only works with simple templates, **will** get it wrong on unusual sequences
+                if processor.get('type') == 'TemplateProcessing':
+                    tmpl_single = processor.get('single', [])
+                    tmpl_pair = processor.get('pair', [])
+                    special_first = None
+                    special_last = None
+                    if len(tmpl_single) > 1:
+                        if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
+                            if not tokenizer_config:
+                                special_bos = special_first
+                            self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
+                            if special_first not in (special_bos, special_cls):
+                                logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
+                        if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
+                            if not tokenizer_config:
+                                special_eos = special_last
+                            self.add_special_token['eos'] = True if special_last == special_eos else False
+                            if special_last != special_eos:
+                                logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
+                    if tmpl_pair:
+                        seq_start = 1 if tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
+                        seq_stop = -1 if tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
+                        if seq_start == 0 or seq_stop is None:
+                            logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
+                        if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
+                            tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
+                            tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
+                            if tmpl_a != 'A' or tmpl_b != 'B':
+                                logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
+                            # A [sep] [eos] B
+                            if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
+                                add_sep = False
+                                if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
+                                    if special_entry in (special_sep, special_eos) and not special_last:
+                                        add_sep = True
+                                    if special_entry not in (special_sep, special_eos):
+                                        logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
+                                else:
+                                    logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
+                                if len(tmpl_pair) == 2:
+                                    if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
+                                        if special_entry in (special_sep, special_eos):
+                                            add_sep = True
+                                        if special_entry not in (special_sep, special_eos):
+                                            logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
+                                    else:
+                                        logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
+                                self.add_special_token['sep'] = add_sep
+                                if add_sep and not special_sep and tokenizer_config:
+                                    tokenizer_config['sep_token'] = special_eos
+                    continue
+        if not tokenizer_config:
+            return True
         chat_template_alt = None
         chat_template_file = path / 'chat_template.json'
         if chat_template_file.is_file():
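For orientation, the crude parser above walks the post_processor section of a HF tokenizer.json. For a BERT-style model that section typically has the following shape (hypothetical excerpt, trimmed; not taken from any file in this repo):

    "post_processor": {
      "type": "TemplateProcessing",
      "single": [
        {"SpecialToken": {"id": "[CLS]"}},
        {"Sequence": {"id": "A"}},
        {"SpecialToken": {"id": "[SEP]"}}
      ],
      "pair": [
        {"SpecialToken": {"id": "[CLS]"}},
        {"Sequence": {"id": "A"}},
        {"SpecialToken": {"id": "[SEP]"}},
        {"Sequence": {"id": "B"}},
        {"SpecialToken": {"id": "[SEP]"}}
      ]
    }

From this shape the heuristic infers add_bos (a known token leads "single"), add_eos (a known token trails it), and add_sep (a SEP/EOS token is spliced between sequences A and B in "pair").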
@@ -1047,6 +1047,7 @@ extern "C" {
 
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
 
     LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1090,6 +1091,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
@@ -198,6 +198,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"        },
     { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"        },
     { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"        },
+    { LLM_KV_TOKENIZER_ADD_SEP,              "tokenizer.ggml.add_sep_token"        },
     { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"     },
     { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
@@ -194,6 +194,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
@@ -228,6 +228,7 @@ void llama_model_saver::add_kv_from_model() {
     // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
     add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
     add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_SEP, vocab.get_add_sep());
     add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
     add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
     add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
@@ -1494,6 +1494,7 @@ struct llama_vocab::impl {
     bool add_space_prefix         = false;
     bool add_bos                  = false;
     bool add_eos                  = false;
+    bool add_sep                  = false;
     bool ignore_merges            = false;
     bool clean_spaces             = false;  // clean_up_tokenization_spaces
     bool remove_extra_whitespaces = false;
@@ -1647,6 +1648,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_sep_id  = 102;
             special_pad_id  = 0;
             special_mask_id = 103;
+
+            add_sep = true;
         } else if (tokenizer_model == "gpt2") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
@@ -1786,12 +1789,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "gigachat" ||
-                tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-de") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-code" ||
                 tokenizer_pre == "roberta-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+                add_sep = true;
             } else if (
                 tokenizer_pre == "refact") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1901,6 +1907,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         clean_spaces = true;
         add_bos = true;
         add_eos = false;
+        add_sep = true;
     } else if (type == LLAMA_VOCAB_TYPE_UGM) {
         pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         add_bos = false;
@@ -2040,7 +2047,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }
     }
 
-    // Handle add_bos and add_eos
+    // Handle add_bos, add_eos and add_sep
     {
         bool temp = true;
 
@@ -2050,6 +2057,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
             add_eos = temp;
         }
+        if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
+            add_sep = temp;
+        }
     }
 
     // auto-detect special tokens by text
@@ -3281,6 +3291,10 @@ bool llama_vocab::get_add_eos() const {
     return pimpl->add_eos;
 }
 
+bool llama_vocab::get_add_sep() const {
+    return pimpl->add_sep;
+}
+
 bool llama_vocab::get_ignore_merges() const {
     return pimpl->ignore_merges;
 }
@@ -3348,6 +3362,11 @@ int32_t llama_vocab::tokenize(
                    bool   add_special,
                    bool   parse_special) const {
     auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
+        return std::numeric_limits<int32_t>::min();
+    }
+
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
@@ -3479,6 +3498,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
     return vocab->get_add_eos();
 }
 
+bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
+    return vocab->get_add_sep();
+}
+
 llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
     return vocab->token_fim_pre();
 }
@@ -76,6 +76,7 @@ struct llama_vocab {
     bool get_add_space_prefix          () const;
     bool get_add_bos                   () const;
     bool get_add_eos                   () const;
+    bool get_add_sep                   () const;
     bool get_ignore_merges             () const;
     bool get_clean_spaces              () const;
     bool get_remove_extra_whitespaces  () const;
@@ -271,12 +271,20 @@ static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_
     }
 
     result.reserve(doc.size() + query.size() + 4);
-    result.push_back(llama_vocab_bos(vocab));
+    if (llama_vocab_get_add_bos(vocab)) {
+        result.push_back(llama_vocab_bos(vocab));
+    }
     result.insert(result.end(), query.begin(), query.end());
-    result.push_back(eos_token);
-    result.push_back(llama_vocab_sep(vocab));
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+    if (llama_vocab_get_add_sep(vocab)) {
+        result.push_back(llama_vocab_sep(vocab));
+    }
     result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(eos_token);
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
 
     return result;
 }
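The guarded pushes above generalize the rerank prompt from a hard-coded BOS query EOS SEP doc EOS layout to whatever the vocab's add_bos/add_eos/add_sep flags request. A self-contained illustration with string stand-ins for the token ids (flag values are examples; real ones come from llama_vocab_get_add_*):

    #include <cstdio>
    #include <string>

    int main() {
        const bool add_bos = true, add_eos = true, add_sep = true;
        const std::string query = "what is panda?", doc = "The giant panda is a bear.";

        std::string result;
        if (add_bos) result += "<bos> ";
        result += query;
        if (add_eos) result += " <eos>";
        if (add_sep) result += " <sep>";
        result += " " + doc;
        if (add_eos) result += " <eos>";
        std::printf("%s\n", result.c_str()); // <bos> what is panda? <eos> <sep> The giant panda is a bear. <eos>
    }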