Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Commit e1f97f7fb5 — Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/llama-server.Dockerfile
#	README.md
#	flake.lock
#	ggml/src/ggml-vulkan.cpp
#	ggml/src/vulkan-shaders/concat.comp
#	ggml/src/vulkan-shaders/pad.comp
#	ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-backend-ops.cpp

55 changed files with 112612 additions and 111077 deletions
@@ -2040,8 +2040,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);

     llama_model * model = nullptr;
@@ -2056,7 +2056,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }

     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2065,7 +2065,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }

     if (!params.control_vectors.empty()) {
@@ -2076,7 +2076,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }

         int err = llama_control_vector_apply(lctx,
@@ -2088,7 +2088,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }

@@ -2100,7 +2100,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
         llama_lora_adapter_set(lctx, adapter, lora_scale);
     }
@@ -2136,7 +2136,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }

-    return std::make_tuple(model, lctx);
+    iparams.model   = model;
+    iparams.context = lctx;
+    return iparams;
 }

 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -330,8 +330,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model   * model   = nullptr;
+    struct llama_context * context = nullptr;
+};
+
+struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);

 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
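For callers outside this diff the migration is mechanical. A minimal caller-side sketch of the new API (the struct and functions are the ones introduced above; the run_inference() wrapper is hypothetical):

#include "common.h"

int run_inference(gpt_params & params) {
    // previously: std::tie(model, ctx) = llama_init_from_gpt_params(params);
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        // on failure both members keep their nullptr defaults
        return 1;
    }

    // ... use model/ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}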
@@ -316,7 +316,7 @@ class Model:
         if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
             if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                 data = gguf.quantize_bf16(data)
-                assert data.dtype == np.int16
+                assert data.dtype == np.uint16
                 data_qtype = gguf.GGMLQuantizationType.BF16

             elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
@@ -2506,6 +2506,112 @@ class NomicBertModel(BertModel):
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])


+@Model.register("XLMRobertaModel")
+class XLMRobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        # realign tokens (see HF tokenizer code)
+        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
+        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
+        toktypes = [
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.CONTROL,
+            SentencePieceTokenTypes.UNKNOWN,
+        ] + toktypes[3:-1]
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA
@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"

-#include <vector>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

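The ternary added above guards against an empty n_pl list: std::max_element on an empty range returns the end iterator, and dereferencing it is undefined behavior. A standalone sketch of the same guard (names are illustrative):

#include <algorithm>
#include <cstdint>
#include <vector>

// returns 1 when the list is empty instead of dereferencing end()
static uint32_t max_seq_or_one(const std::vector<int> & n_pl) {
    return n_pl.empty() ? 1 : (uint32_t) *std::max_element(n_pl.begin(), n_pl.end());
}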
@@ -414,9 +414,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the model to get hparams
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
@@ -80,11 +80,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
@@ -163,9 +163,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;

     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
@@ -612,10 +612,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;

     // init
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
@@ -180,7 +180,10 @@ int main(int argc, char ** argv) {

     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;

     if (model == NULL) {
         LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -58,11 +58,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

     // Tokenize the prompt
     std::vector<llama_token> inp;
@@ -22,11 +22,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     GGML_ASSERT(model != nullptr);

     // tokenize the prompt
@@ -26,11 +26,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

     // tokenize the prompt
     std::vector<llama_token> inp;
@@ -34,11 +34,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

     // tokenize the prompt
     std::vector<llama_token> inp;
@@ -208,7 +208,10 @@ int main(int argc, char ** argv) {

     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;
     if (sparams.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
         ctx_guidance = llama_new_context_with_model(model, lparams);
@@ -131,11 +131,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;

     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
@@ -2019,11 +2019,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
     // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
@@ -148,11 +148,12 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
@@ -29,10 +29,11 @@ int main(int argc, char ** argv) {
     std::string result2;

     // init
-    llama_model * model;
-    llama_context * ctx;
+    llama_init_result llama_init = llama_init_from_gpt_params(params);

-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;
@@ -678,7 +678,10 @@ struct server_context {
         // dedicate one sequence to the system prompt
         params.n_parallel += 1;

-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+        model = llama_init.model;
+        ctx = llama_init.context;
         params.n_parallel -= 1; // but be sneaky about it
         if (model == nullptr) {
             LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -901,7 +904,7 @@ struct server_context {

         slot.params.stream       = json_value(data, "stream", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", false);
-        slot.params.n_predict    = json_value(data, "n_predict", default_params.n_predict);
+        slot.params.n_predict    = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k       = json_value(data, "top_k", default_sparams.top_k);
         slot.sparams.top_p       = json_value(data, "top_p", default_sparams.top_p);
         slot.sparams.min_p       = json_value(data, "min_p", default_sparams.min_p);
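The nested lookup above makes the OpenAI-style "max_tokens" field act as a fallback for "n_predict". A rough equivalent written against nlohmann::json's value() — the server uses its own json_value() helper, so this only illustrates the precedence:

#include <nlohmann/json.hpp>

static int resolve_n_predict(const nlohmann::json & data, int default_n_predict) {
    // "n_predict" wins if present, then "max_tokens", then the server default
    return data.value("n_predict", data.value("max_tokens", default_n_predict));
}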
@@ -355,24 +355,6 @@ static json oaicompat_completion_params_parse(

     llama_params["__oaicompat"] = true;

-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 1.0);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

@@ -68,7 +68,9 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;

     // load the target model
-    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+    model_tgt = llama_init_tgt.model;
+    ctx_tgt = llama_init_tgt.context;

     // load the draft model
     params.model = params.model_draft;
@@ -77,7 +79,9 @@ int main(int argc, char ** argv) {
         params.n_threads = params.n_threads_draft;
     }
     params.n_threads_batch = params.n_threads_batch_draft;
-    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+    model_dft = llama_init_dft.model;
+    ctx_dft = llama_init_dft.context;

     const bool vocab_type_tgt = llama_vocab_type(model_tgt);
     LOG("vocab_type tgt: %d\n", vocab_type_tgt);
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


-.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
@@ -355,6 +355,7 @@ extern "C" {
     GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
     GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
     GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
     GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

     struct ggml_object;
@@ -1145,16 +1146,17 @@ extern "C" {

     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     GGML_API struct ggml_tensor * ggml_group_norm(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);

     GGML_API struct ggml_tensor * ggml_group_norm_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   n_groups);
+            int                   n_groups,
+            float                 eps);

     // a - x
     // b - dy
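Because eps is now an explicit argument, existing graph-building code must be updated at every ggml_group_norm / ggml_group_norm_inplace call site. A minimal sketch of an updated caller (1e-6f reproduces the value the removed TODO said was hardcoded; the wrapper itself is hypothetical):

#include "ggml.h"

static struct ggml_tensor * group_norm_legacy_eps(struct ggml_context * ctx0,
                                                  struct ggml_tensor  * x,
                                                  int                   n_groups) {
    // previously: ggml_group_norm(ctx0, x, n_groups);
    return ggml_group_norm(ctx0, x, n_groups, 1e-6f);
}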
@@ -1461,7 +1463,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1478,6 +1479,7 @@ extern "C" {
             int                   mode);

     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -384,8 +384,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -496,8 +496,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -614,7 +614,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -745,8 +745,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1266,8 +1266,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1728,7 +1728,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
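All of the replacements above swap the direct svcntw() == 8 probe (8 × 32-bit words) for a comparison of a cached SVE vector length in bytes against QK8_0 (32), which expresses the same "256-bit SVE" condition. A hedged sketch of how such a cached value can be derived, assuming ggml_sve_cnt_b is filled in from the SVE byte-count intrinsic:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// svcntb() returns the SVE vector length in bytes; 32 bytes == 256 bits,
// the same condition the old svcntw() == 8 check expressed.
static inline int detect_sve_cnt_b(void) {
    return (int) svcntb();
}
#endif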
@@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
 GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
                                                        const void* src,
                                                        void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);

     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
 */
 GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
     const ggml_tensor* tensor, void* src, void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);

     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -898,11 +896,10 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
 * @param size Size of the data to be copied, in bytes.
 */
 GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
-    ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
+    ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
     size_t offset, size_t size) {
-    // GGML_ASSERT(size == ggml_nbytes(tensor));
-    ggml_backend_cann_buffer_context* ctx =
-        (ggml_backend_cann_buffer_context*)buffer->context;
+    ggml_backend_cann_buffer_context *ctx =
+        (ggml_backend_cann_buffer_context *)buffer->context;

     ggml_cann_set_device(ctx->device);
     // TODO: refer to cann(#6017), it use thread's default stream.
@@ -910,22 +907,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
     // Why aclrtSynchronizeDevice?

     if (!need_transform(tensor->type)) {
-        ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset,
-                              size, ACL_MEMCPY_HOST_TO_DEVICE));
+        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
+                              ACL_MEMCPY_HOST_TO_DEVICE));
     } else {
-        void* transform_buffer = malloc(size);
-        ggml_backend_cann_transform(tensor, (const char*)data + offset,
-                                    transform_buffer);
+        void *transform_buffer = malloc(size);
+        ggml_backend_cann_transform(tensor, data, transform_buffer);

 #ifndef NDEBUG
-        void* check_buffer = malloc(size);
+        void *check_buffer = malloc(size);
         ggml_backend_cann_transform_back(tensor, transform_buffer,
                                          check_buffer);
-        GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) ==
-                    0);
+        GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
         free(check_buffer);
 #endif
-        ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size,
+        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
+                              transform_buffer, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
         free(transform_buffer);
     }
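The rewritten copies change what offset means: the host pointer data already addresses the bytes to upload, and offset selects where inside the destination tensor they land. A plain-memory sketch of that contract, with memcpy standing in for aclrtMemcpy (the ACL call itself is kept exactly as in the diff above):

#include <cstddef>
#include <cstring>

// copy `size` bytes from `data` into the tensor storage starting at `offset`
static void set_tensor_range(void * tensor_base, size_t offset,
                             const void * data, size_t size) {
    std::memcpy((char *) tensor_base + offset, data, size);
}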
@@ -947,21 +943,20 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
 GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
     ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
     size_t offset, size_t size) {
-    GGML_ASSERT(size == ggml_nbytes(tensor));
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;

     ggml_cann_set_device(ctx->device);

     if (!need_transform(tensor->type)) {
-        ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size,
+        ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
                               ACL_MEMCPY_DEVICE_TO_HOST));
     } else {
         void* transform_buffer = malloc(size);
-        ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size,
+        ACL_CHECK(aclrtMemcpy(transform_buffer, size,
+                              (char*)tensor->data + offset, size,
                               ACL_MEMCPY_DEVICE_TO_HOST));
-        ggml_backend_cann_transform_back(tensor, transform_buffer,
-                                         (char*)data + offset);
+        ggml_backend_cann_transform_back(tensor, transform_buffer, data);
         free(transform_buffer);
     }
 }
@@ -1450,42 +1445,41 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
 * @param size Size of the data to copy in bytes.
 */
 GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
-                                                         ggml_tensor* tensor,
-                                                         const void* data,
+                                                         ggml_tensor *tensor,
+                                                         const void *data,
                                                          size_t offset,
                                                          size_t size) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
+    ggml_backend_cann_context *cann_ctx =
+        (ggml_backend_cann_context *)backend->context;

     if (!need_transform(tensor->type)) {
-        ACL_CHECK(aclrtMemcpyAsync(
-            tensor->data, size, (const char*)data + offset, size,
-            ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
+        ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
+                                   size, ACL_MEMCPY_HOST_TO_DEVICE,
+                                   cann_ctx->stream()));
     } else {
-        void* transform_buffer = malloc(size);
-        ggml_backend_cann_transform(tensor, (const char*)data + offset,
-                                    transform_buffer);
+        void *transform_buffer = malloc(size);
+        ggml_backend_cann_transform(tensor, data, transform_buffer);

 #ifndef NDEBUG
-        void* check_buffer = malloc(size);
+        void *check_buffer = malloc(size);
         ggml_backend_cann_transform_back(tensor, transform_buffer,
                                          check_buffer);
-        GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size));
+        GGML_ASSERT(memcmp(data, check_buffer, size));
         free(check_buffer);
 #endif
-        ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size,
-                                   ACL_MEMCPY_HOST_TO_DEVICE,
-                                   cann_ctx->stream()));
+        ACL_CHECK(aclrtMemcpyAsync(
+            (char *)tensor->data + offset, size, transform_buffer, size,
+            ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
         ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
         free(transform_buffer);
     }
 }

 GGML_CALL static void ggml_backend_cann_get_tensor_async(
-    ggml_backend_t backend, const ggml_tensor* tensor, void* data,
+    ggml_backend_t backend, const ggml_tensor *tensor, void *data,
     size_t offset, size_t size) {
-    ggml_backend_cann_context* cann_ctx =
-        (ggml_backend_cann_context*)backend->context;
+    ggml_backend_cann_context *cann_ctx =
+        (ggml_backend_cann_context *)backend->context;
     ggml_backend_buffer_t buf =
         tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

@@ -1493,17 +1487,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
                 "unsupported buffer type");

     if (!need_transform(tensor->type)) {
-        ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data,
+        ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
                                    size, ACL_MEMCPY_DEVICE_TO_HOST,
                                    cann_ctx->stream()));
     } else {
-        void* transform_buffer = malloc(size);
-        ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size,
-                                   ACL_MEMCPY_DEVICE_TO_HOST,
-                                   cann_ctx->stream()));
+        void *transform_buffer = malloc(size);
+        ACL_CHECK(aclrtMemcpyAsync(
+            transform_buffer, size, (char *)tensor->data + offset, size,
+            ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
         ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
-        ggml_backend_cann_transform_back(tensor, transform_buffer,
-                                         (char*)data + offset);
+        ggml_backend_cann_transform_back(tensor, transform_buffer, data);
         free(transform_buffer);
     }
 }
@@ -1666,10 +1659,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
-                // case GGML_TYPE_Q4_0:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q8_0:
+                // TODO: fix me
+                // Current groupsize should not be greater than k-1 in
+                // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+                case GGML_TYPE_Q4_0:
                     return true;
                 default:
                     return false;
@@ -1694,6 +1690,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
                     return true;
                 default:
                     return false;
@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_INT16;
         case GGML_TYPE_I32:
             return ACL_INT32;
+        case GGML_TYPE_Q4_0:
+            return ACL_INT4;
+        case GGML_TYPE_Q8_0:
+            return ACL_INT8;
         default:
             return ACL_DT_UNDEFINED;
     }
@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
     return false;
 }

-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format,
-                                   size_t offset) {
-    int64_t tmp_ne[GGML_MAX_DIMS * 2];
-    int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
-    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
-    for (int i = 0; i < dims; i++) {
-        tmp_stride[i] = nb[i] / type_size;
-    }
-
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
-
-    return acl_tensor;
-}
-
 int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
                                   const ggml_tensor* src1,
                                   int64_t* bcast_src0_ne,
@@ -23,6 +23,9 @@
 #ifndef CANN_ACL_TENSOR_H
 #define CANN_ACL_TENSOR_H

+#include <algorithm>
+#include <cstring>
+
 #include <aclnn/aclnn_base.h>
 #include "common.h"

@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
                                    size_t offset = 0);

 /**
- * @brief Creates an ACL tensor from provided parameters.
+ * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
+ *        should be size_t or float.
  *
  * @details This function creates an ACL tensor using the provided data pointer,
  * data type, dimensions, strides, format, offset, and additional parameters.
@@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
  * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return Pointer to the created ACL tensor.
  */
+template<typename TYPE>
 aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0);
+                                   TYPE type_size, int64_t* ne, TYPE* nb,
+                                   int64_t dims,
+                                   aclFormat format = ACL_FORMAT_ND,
+                                   size_t offset = 0) {
+    int64_t tmp_ne[GGML_MAX_DIMS * 2];
+    int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+    for (int i = 0; i < dims; i++) {
+        tmp_stride[i] = nb[i] / type_size;
+    }
+
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
+    int64_t acl_storage_len = 0;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (ne[i] - 1) * nb[i];
+    }
+
+    aclTensor* acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+                        format, &acl_storage_len, 1, data_ptr);
+
+    return acl_tensor;
+}
+
 /**
  * @brief Checks if tensors require broadcasting based on their shapes.
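With the template in place, stride arrays typed as size_t or float both resolve to the same helper. A hedged usage sketch (the device pointer buf, the shape, and the surrounding wrapper are made up; ACL_INT8 is one of the data types mapped earlier in this diff, and the header name is assumed to be the one guarded by CANN_ACL_TENSOR_H above):

static aclTensor * make_int8_vector_tensor(void * buf /* valid device memory, assumed */) {
    int64_t ne[] = {32, 1};
    size_t  nb[] = {sizeof(int8_t), 32 * sizeof(int8_t)};
    // TYPE is deduced as size_t from type_size/nb; a float-typed nb array would
    // instantiate the float variant the new doc comment mentions
    return ggml_cann_create_tensor(buf, ACL_INT8, sizeof(int8_t), ne, nb, 2, ACL_FORMAT_ND);
}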
@@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src = ggml_cann_create_tensor(src);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    const float eps = 1e-6f; // TODO: make this a parameter
     int n_groups = dst->op_params[0];

+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
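The memcpy above reads eps from dst->op_params + 1, i.e. the float is expected to sit immediately after the int32 group count when the op is built. A sketch of that packing convention (illustrative only, not the actual ggml_group_norm implementation):

#include <cstdint>
#include <cstring>

static void pack_group_norm_params(int32_t * op_params, int32_t n_groups, float eps) {
    op_params[0] = n_groups;                        // read back as dst->op_params[0]
    std::memcpy(&op_params[1], &eps, sizeof(eps));  // read back via memcpy from op_params + 1
}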
@@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->ne);
             return;
         }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
         if (dst->type == GGML_TYPE_F16) {
             if (ggml_are_same_shape(src, dst)) {
                 cann_copy(ctx, acl_src, acl_dst);
@@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->ne);
             return;
         }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
         if (dst->type == GGML_TYPE_F32) {
             if (ggml_are_same_shape(src, dst)) {
                 cann_copy(ctx, acl_src, acl_dst);
@@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
 #ifdef __cplusplus
 }
 #endif
 
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
+                                             ggml_tensor* dst,
+                                             ggml_tensor* src1,
+                                             aclTensor* tmp_cast_tensor,
+                                             aclTensor* tmp_im2col_tensor) {
+    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+static void ggml_cann_im2col_1d_post_process(
+    ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
+    aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
+    const std::vector<int64_t>& im2col_op_params) {
+    // get params
+    const int64_t KH = im2col_op_params[0];
+    const int64_t KW = im2col_op_params[1];
+    const int64_t IW = im2col_op_params[2];
+    const int64_t IC = im2col_op_params[3];
+    const int64_t N = im2col_op_params[4];
+    const int64_t OH = im2col_op_params[5];
+    const int64_t OW = im2col_op_params[6];
+    const int64_t s0 = im2col_op_params[7];
+    const int64_t p0 = im2col_op_params[8];
+    const int64_t d0 = im2col_op_params[9];
+    const int64_t n_bytes_factor = im2col_op_params[10];
+
+    // Permute: [N, IC * KH * KW, OW * OH] ->
+    // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+    aclTensor* tmp_permute_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+    tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+    void* tmp_permute_buffer = tmp_permute_allocator.get();
+
+    int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
+    size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+    tmp_permute_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+    }
+
+    tmp_permute_tensor = ggml_cann_create_tensor(
+        tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
+                      3);
+    }
+
+    // number of times the kernel moves in W dimension
+    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+    size_t offset;
+    void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+    // memory copy with offset to restore 1D im2col from 2d
+    if (IC > 1) {
+        offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+        size_t size_cpy = KH * KW * ggml_type_size(dst->type);
+
+        for (int c = 0; c < IC; c++) {
+            cur_permute_buffer = (char*)tmp_permute_buffer + offset +
+                                 KH * KW * c * ggml_type_size(dst->type);
+            cur_dst_buffer = (char*)dst->data +
+                             c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+            for (int i = 0; i < n_step_w; i++) {
+                ACL_CHECK(aclrtMemcpyAsync(
+                    cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                cur_dst_buffer =
+                    (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+                cur_permute_buffer = (char*)cur_permute_buffer +
+                                     KH * KW * IC * ggml_type_size(dst->type);
+            }
+        }
+    } else {
+        offset = KH * KW * n_step_w *
+                 ggml_type_size(dst->type);  // equal to ggml_nbytes(dst)
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
+                                   (char*)tmp_permute_buffer + offset, offset,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+}
+
 void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
@@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
-    const int64_t N = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
+    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
+    // im2col and do post-processing to restore it to 1D.
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
 
-    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t N = ne13;
+    const int64_t IC = ne12;
+    const int64_t KH = ne01;
     const int64_t KW = ne00;
+    const int64_t IW = ne10;
 
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
@@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
-    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
+    // memory allocated increased to 3x when is_2D == false
+    const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
-    int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
+    int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
     size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
 
     tmp_im2col_nb[0] = ggml_type_size(src1->type);
@@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
     // dst.elemcount.
     ggml_cann_pool_alloc im2col_allocator(
-        ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
+        ctx.pool(),
+        ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
     void* tmp_im2col_buffer = im2col_allocator.get();
 
     aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
         tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
         ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                   paddings, strides, tmp_im2col_tensor,
                                   &workspaceSize, &executor));
 
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
     if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspace_allocator.alloc(workspaceSize);
         workspaceAddr = workspace_allocator.get();
     }
 
@@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // Cast if dst is f16.
     aclTensor* tmp_cast_tensor = nullptr;
     ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+    void* tmp_cast_buffer = nullptr;
     if (src1->type != dst->type) {
-        tmp_cast_allocator.alloc(ggml_nbytes(dst));
-        void* tmp_cast_buffer = tmp_cast_allocator.get();
+        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+        tmp_cast_buffer = tmp_cast_allocator.get();
         size_t temp_cast_nb[GGML_MAX_DIMS - 1];
         temp_cast_nb[0] = ggml_type_size(dst->type);
         for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ggml_cann_type_mapping(dst->type));
     }
 
-    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
-    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
-    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
-    aclTensor* acl_dst =
-        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
-
-    int64_t permute_dim[] = {0, 2, 1};
-    if (src1->type != dst->type) {
-        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    // post-processing
+    if (is_2D) {
+        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor);
     } else {
-        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+        std::vector<int64_t> im2col_op_params = {
+            KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
+        ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor, im2col_op_params);
     }
 
     // release
     ACL_CHECK(aclDestroyTensor(acl_src1));
     ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
     ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
     ACL_CHECK(aclDestroyIntArray(kernel_size));
     ACL_CHECK(aclDestroyIntArray(dilations));
     ACL_CHECK(aclDestroyIntArray(paddings));
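Since aclnnIm2col itself only handles the 2D case, the 1D path above runs a 2D im2col with s1 = p1 = d1 = 1 and then copies the useful columns back out; the key quantity is n_step_w, the number of positions the kernel visits along the width. A small standalone check of that formula, with made-up sizes rather than values from the diff:

    #include <cstdint>
    #include <cstdio>

    // Matches the expression used in ggml_cann_im2col_1d_post_process above.
    static int64_t conv1d_out_width(int64_t IW, int64_t KW, int64_t s0,
                                    int64_t p0, int64_t d0) {
        return (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
    }

    int main() {
        // e.g. a 16-wide input row, kernel width 3, stride 1, padding 1, dilation 1
        std::printf("n_step_w = %lld\n",
                    (long long) conv1d_out_width(16, 3, 1, 1, 1));  // prints 16
        return 0;
    }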
@@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
  * @param dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst) {
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
     // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
     // is regarded as batch. weight need transpose.
     int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
-    size_t weight_elem_size = sizeof(uint8_t);
-    size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    }
+    else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    }
+    else {
+        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+    }
+    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+
     // size of one matrix is element_size * height * width.
     size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
+    GGML_ASSERT(QK4_0 == QK8_0);
     int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
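The renamed ggml_cann_mul_mat_quant handles both quantized weight types by expressing the packed weight size per element as a float: half a byte for Q4_0, one byte for Q8_0, with the fp16 scales stored separately after the data. A rough standalone sketch of that bookkeeping under those assumptions (shapes are illustrative only):

    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Per-element byte size of the packed weight data; scales are not included.
    static float weight_elem_size_bytes(const std::string & type) {
        if (type == "Q4_0") return 0.5f;   // two 4-bit values per byte
        if (type == "Q8_0") return 1.0f;   // one int8 per value
        throw std::runtime_error("only Q4_0 and Q8_0 are handled here");
    }

    int main() {
        const int64_t ne0 = 4096, ne1 = 4096;   // illustrative matrix shape
        for (const char * t : {"Q4_0", "Q8_0"}) {
            const float  elem  = weight_elem_size_bytes(t);
            const double bytes = (double) elem * ne0 * ne1;  // one matrix
            std::printf("%s: %.1f bytes/elem, %.0f bytes per matrix\n", t, elem, bytes);
        }
        return 0;
    }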
@@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-                (char*)src0->data + batch0 * weight_stride, ACL_INT8,
-                weight_elem_size, weight_ne, weight_nb, 2);
+                (char*)src0->data + batch0 * weight_stride,
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
                 scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                 scale_elem_size, scale_ne, scale_nb, 2);
@@ -2485,11 +2625,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         case GGML_TYPE_F16:
             ggml_cann_mat_mul_fp(ctx, dst);
             break;
-        // case GGML_TYPE_Q4_0:
-        //     ggml_cann_mul_mat_q4_0(ctx, dst);
-        //     break;
+        case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_q8_0(ctx, dst);
+            ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
             GGML_ABORT("fatal error");
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
     get_row_q8_0.cpp
     quantize_f32_q8_0.cpp
     quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
     dup.cpp
 )
 
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
@@ -8,6 +8,8 @@
 
 #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
 
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
 ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp (new file, 278 additions)
@@ -0,0 +1,278 @@
+#include "kernel_operator.h"
+
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+#define Group_Size 32
+
+template <typename SRC_T>
+class QUANTIZE_FLOAT_TO_Q4_0 {
+   public:
+    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
+    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
+                                int64_t *input_ne_ub, size_t *input_nb_ub,
+                                int64_t *output_ne_ub) {
+        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+        // permute=[0,0,0,0]):
+        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
+        int64_t op_block_num = GetBlockNum();
+        int64_t op_block_idx = GetBlockIdx();
+
+        // input stride of data elements
+        for (int i = 0; i < 4; i++) {
+            input_ne[i] = input_ne_ub[i];
+            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
+            output_ne[i] = output_ne_ub[i];
+        }
+
+        // output stride of data elements
+        output_stride[0] = 1;
+        for (int i = 1; i < 4; i++) {
+            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
+        }
+
+        // scale saved one by one after data:. [group1_scale, group2_scale, ...]
+        scale_ne = input_ne;
+        scale_stride[0] = 1;
+        scale_stride[1] = input_ne[0] / Group_Size;
+        for (int i = 2; i < 4; i++) {
+            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
+        }
+
+        // split input tensor by rows.
+        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
+        dr = nr / op_block_num;
+
+        uint64_t tails = nr % op_block_num;
+        if (op_block_idx < tails) {
+            dr += 1;
+            ir = dr * op_block_idx;
+        } else {
+            ir = dr * op_block_idx + tails;
+        }
+
+        group_size_in_row = scale_stride[1];
+        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
+                               output_ne[3] * sizeof(uint8_t) / 2;
+
+        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
+        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
+        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
+                                                 group_size_in_row *
+                                                 sizeof(half)));
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
+        pipe.InitBuffer(output_queue, BUFFER_NUM,
+                        Group_Size * sizeof(int8_t) / 2);
+        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
+        DataCopy(input_local, input_gm[offset], Group_Size);
+        input_queue.EnQue(input_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
+        // and using DataCopyPad to avoid 32 bits align.
+        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
+        LocalTensor<int8_t> output_int8_local =
+            output_local.ReinterpretCast<int8_t>();
+
+        DataCopyExtParams dataCopyParams;
+        dataCopyParams.blockCount = 1;
+        dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
+        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
+
+        output_queue.FreeTensor(output_local);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<float> input_local) {
+        DataCopy(cast_local, input_local, Group_Size);
+    }
+
+    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
+                                         LocalTensor<half> input_local) {
+        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
+    }
+
+    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
+        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
+        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
+        const int64_t i1 =
+            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
+
+        const int64_t input_offset = i1 * input_stride[1] +
+                                     i2 * input_stride[2] +
+                                     i3 * input_stride[3] + Group_Size * group;
+
+        // output_offset is stride for output_gm which datatype is int8_t and
+        // divided by 2 is needed for int4b_t.
+        const int64_t output_offset = (i1 * output_stride[1] +
+                                       i2 * output_stride[2] +
+                                       i3 * output_stride[3] +
+                                       Group_Size * group) / 2;
+        copy_in(input_offset);
+
+        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
+        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
+        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
+        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
+        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
+        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
+        LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
+        LocalTensor<half> half_local = half_queue.AllocTensor<half>();
+
+        input_to_cast(cast_local, input_local);
+
+        ReduceMax(max_local, cast_local, work_local, Group_Size);
+        ReduceMin(min_local, cast_local, work_local, Group_Size);
+        const float max_value = max_local.GetValue(0);
+        const float min_value = min_local.GetValue(0);
+        float d = max_value;
+        if (min_value < 0 && (-1 * min_value) > max_value) {
+            d = min_value;
+        }
+
+        d = d / (-8);
+        if (d != 0) {
+            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
+        }
+
+        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
+        float scalar = 8.5f;
+        Adds(cast_local, cast_local, scalar, Group_Size);
+        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
+        scalar = 15.0f;
+        Mins(cast_local, cast_local, scalar, Group_Size);
+        scalar = -8.0f;
+        Adds(cast_local, cast_local, scalar, Group_Size);
+
+        // float->half->int4b
+        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
+        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
+
+        output_queue.EnQue(output_local);
+        copy_out(output_offset);
+
+        input_queue.FreeTensor(input_local);
+        work_queue.FreeTensor(work_local);
+        max_queue.FreeTensor(max_local);
+        min_queue.FreeTensor(min_local);
+        int8_queue.FreeTensor(int8_local);
+        half_queue.FreeTensor(half_local);
+        cast_queue.FreeTensor(cast_local);
+        return (half)d;
+    }
+
+    __aicore__ inline void calculate() {
+        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
+        uint32_t scale_local_offset = 0;
+        uint32_t scale_global_offset = 0;
+        for (int64_t i = ir; i < ir + dr; i++) {
+            for (int64_t j = 0; j < group_size_in_row; j++) {
+                half scale = calculate_group(i, j);
+                scale_local.SetValue(scale_local_offset++, scale);
+                // Copy Group_Size/2 length data each time.
+                if (scale_local_offset == Group_Size / 2) {
+                    scale_local_offset = 0;
+                    // TODO: OPTIMIZE ME
+                    pipe_barrier(PIPE_ALL);
+                    DataCopy(scale_gm[scale_global_offset], scale_local,
+                             Group_Size / 2);
+                    pipe_barrier(PIPE_ALL);
+                    scale_global_offset += Group_Size / 2;
+                }
+            }
+        }
+
+        if (scale_local_offset != 0) {
+            pipe_barrier(PIPE_ALL);
+            DataCopyExtParams dataCopyParams;
+            dataCopyParams.blockCount = 1;
+            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
+            DataCopyPad(scale_gm[scale_global_offset], scale_local,
+                        dataCopyParams);
+            pipe_barrier(PIPE_ALL);
+        }
+        scale_queue.FreeTensor(scale_local);
+    }
+
+   private:
+    int64_t input_ne[4];
+    size_t input_stride[4];
+
+    int64_t *scale_ne;
+    size_t scale_stride[4];
+
+    int64_t output_ne[4];
+    size_t output_stride[4];
+
+    int64_t group_size_in_row;
+
+    int64_t ir;
+    int64_t dr;
+
+    TPipe pipe;
+    GlobalTensor<SRC_T> input_gm;
+    GlobalTensor<half> scale_gm;
+    GlobalTensor<int8_t> output_gm;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
+};
+
+template <typename T>
+__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
+    auto gm_ptr = (__gm__ uint8_t *)gm;
+    auto ub_ptr = (uint8_t *)(ub);
+    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
+        *ub_ptr = *gm_ptr;
+    }
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_FLOAT_TO_Q4_0<half> op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
+
+extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
+    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+
+    QUANTIZE_FLOAT_TO_Q4_0<float> op;
+    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
+    op.calculate();
+}
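For reference, the per-group math the AscendC kernel above implements can be written as a scalar host-side sketch: pick the signed value of largest magnitude in each 32-element group, derive the scale d = m / -8, then map every element into the 4-bit range [-8, 7]. The packing into int4 pairs and the fp16 scale storage are omitted; this is illustrative only, not the reference quantizer from ggml.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static float quantize_group_q4_0(const float * x, int8_t * q, int n) {
        float m = 0.0f;
        for (int i = 0; i < n; i++) {
            if (std::fabs(x[i]) > std::fabs(m)) m = x[i];  // keep the sign
        }
        const float d  = m / -8.0f;
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        for (int i = 0; i < n; i++) {
            // same [-8,8] -> [0.5,16.5] -> [0,15] -> [-8,7] mapping as the kernel
            const float v = std::floor(x[i] * id + 8.5f);
            q[i] = (int8_t) (std::min(v, 15.0f) - 8.0f);
        }
        return d;  // stored as a half-precision scale after the packed data
    }

    int main() {
        std::vector<float> x(32);
        for (int i = 0; i < 32; i++) x[i] = 0.1f * (i - 16);
        std::vector<int8_t> q(32);
        const float d = quantize_group_q4_0(x.data(), q.data(), 32);
        std::printf("d=%g q[0]=%d q[31]=%d\n", d, q[0], q[31]);
        return 0;
    }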
@@ -142,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     }
 }
 
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
     if (group_size < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
         group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@@ -196,8 +195,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     int num_groups = dst->op_params[0];
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
+    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
 }
 
 void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
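On the CUDA side the kernel launch now takes eps from the op as well; the shape logic is unchanged: each group covers a contiguous block of channels, rounded up so every channel belongs to a group, and the grid is num_groups times the batch size. A small standalone illustration of that arithmetic with made-up shape values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne[4] = {64, 64, 320, 2};   // W, H, C, N (illustrative)
        const int num_groups = 32;
        const int group_size =
            (int) (ne[0] * ne[1] * ((ne[2] + num_groups - 1) / num_groups));
        std::printf("elements per group = %d, groups over the batch = %lld\n",
                    group_size, (long long) (num_groups * ne[3]));
        return 0;
    }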
@@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
 static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
@@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }
@@ -146,6 +143,7 @@ extern "C" {
 
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif
 
 // 16-bit float
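The revised comment and the removal of the subnormal branch describe plain round-to-nearest-even truncation with NaN quieting. A scalar sketch of that conversion, written to match the behaviour the comment describes rather than copied from ggml-impl.h:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint16_t fp32_to_bf16_rne(float s) {
        uint32_t u;
        std::memcpy(&u, &s, sizeof(u));
        if ((u & 0x7fffffff) > 0x7f800000) {          // NaN: force the quiet bit
            return (uint16_t) ((u >> 16) | 64);
        }
        return (uint16_t) ((u + (0x7fff + ((u >> 16) & 1))) >> 16);  // round to nearest even
    }

    int main() {
        std::printf("1.0f    -> 0x%04x\n", fp32_to_bf16_rne(1.0f));      // 0x3f80
        std::printf("3.14159 -> 0x%04x\n", fp32_to_bf16_rne(3.14159f));
        return 0;
    }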
@@ -2229,10 +2229,8 @@ static enum ggml_status ggml_metal_graph_compute(
                 GGML_ASSERT(ne00 % 4 == 0);
                 GGML_ASSERT(ggml_is_contiguous(src0));
 
-                //float eps;
-                //memcpy(&eps, dst->op_params, sizeof(float));
-
-                const float eps = 1e-6f; // TODO: temporarily hardcoded
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
 
                 const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -3819,7 +3819,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5304,7 +5304,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
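The new ggml_sve_cnt_b global caches the SVE vector length so the hot dot-product loops compare an int instead of calling svcntb() each time. The place where this commit actually assigns it is not part of this excerpt, so the following is only a sketch of the caching pattern under that assumption; it needs an SVE-capable toolchain and compiles to nothing elsewhere:

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    int ggml_sve_cnt_b = 0;   // stand-in for the global defined in ggml.c

    static void init_sve_cnt(void) {
        if (ggml_sve_cnt_b == 0) {
            ggml_sve_cnt_b = (int) svcntb();   // SVE register width in bytes
        }
    }

    static int use_sve_q8_0_path(void) {
        init_sve_cnt();
        return ggml_sve_cnt_b == 32;           // QK8_0 == 32 in ggml
    }
    #endif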
@@ -225,9 +225,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
 }
 
 static void group_norm_f32_sycl(const float* x, float* dst,
-                                const int num_groups, const int group_size,
+                                const int num_groups, const float eps, const int group_size,
                                 const int ne_elements, queue_ptr stream, int device) {
-    static const float eps = 1e-6f;
     if (group_size < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler& cgh) {
@@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     int num_groups = dst->op_params[0];
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
 
     (void)src1;
     (void)dst;

(File diff suppressed because it is too large)
@@ -19,7 +19,6 @@
 #include <map>
 #include <memory>
 #include <mutex>
-#include <unordered_map>
 
 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -181,7 +180,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
     vk_pipeline pipeline_mul_f32;
     vk_pipeline pipeline_div_f32;
-    vk_pipeline pipeline_concat_f32, pipeline_concat_i32;
+    vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
     vk_pipeline pipeline_upscale_f32;
     vk_pipeline pipeline_scale_f32;
     vk_pipeline pipeline_sqr_f32;
@@ -195,6 +194,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_gelu_quick_f32;
     vk_pipeline pipeline_silu_f32;
     vk_pipeline pipeline_relu_f32;
+    vk_pipeline pipeline_leaky_relu_f32;
     vk_pipeline pipeline_tanh_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
@@ -406,7 +406,7 @@ struct vk_context_struct {
     vk_submission * s;
     std::vector<vk_sequence> seqs;
 
-    ggml_tensor * exit_tensor;
+    int exit_tensor_idx;
 
     std::vector<vk_staging_memcpy> in_memcpys;
     std::vector<vk_staging_memcpy> out_memcpys;
@@ -494,7 +494,7 @@ struct ggml_backend_vk_context {
     vk_context_ref compute_ctx;
     vk_context_ref transfer_ctx;
 
-    std::unordered_map<ggml_tensor *, vk_context_ref> tensor_ctxs;
+    std::vector<vk_context_ref> tensor_ctxs;
 };
 
 #ifdef GGML_VULKAN_MEMORY_DEBUG
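The Vulkan backend change in this struct swaps the per-tensor unordered_map for a vector indexed by graph node position, resized once per graph, which is what the later ggml_vk_build_graph and ggml_vk_compute_forward hunks rely on. A stand-in sketch of that ownership pattern (the types here are fakes, not the real vk_context machinery):

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct fake_ctx { int id; };
    using ctx_ref = std::weak_ptr<fake_ctx>;

    int main() {
        const int n_nodes = 4;
        std::vector<ctx_ref> tensor_ctxs;
        tensor_ctxs.resize(n_nodes);                 // reserve space for all nodes

        auto compute = std::make_shared<fake_ctx>(fake_ctx{42});
        for (int i = 0; i < n_nodes; i++) {
            tensor_ctxs[i] = compute;                // record by node index
        }
        for (int i = 0; i < n_nodes; i++) {
            if (auto c = tensor_ctxs[i].lock()) {    // same .lock() pattern as the diff
                std::printf("node %d -> ctx %d\n", i, c->id);
            }
        }
        return 0;
    }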
@@ -1646,6 +1646,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
@@ -1662,6 +1663,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
@@ -2106,9 +2108,9 @@ void ggml_vk_instance_init() {
 }
 
 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+    GGML_ASSERT(idx < vk_instance.device_indices.size());
     VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
     ggml_vk_instance_init();
-    GGML_ASSERT(idx < vk_instance.device_indices.size());
 
     ctx->name = GGML_VK_NAME + std::to_string(idx);
 
@@ -2158,7 +2160,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
@@ -3948,6 +3950,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_concat_f32;
         }
+        if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+            return ctx->device->pipeline_concat_f16;
+        }
         if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
             return ctx->device->pipeline_concat_i32;
         }
@@ -4087,6 +4092,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_timestep_embedding_f32;
         }
         return nullptr;
+    case GGML_OP_LEAKY_RELU:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_leaky_relu_f32;
+        }
+        return nullptr;
     default:
         return nullptr;
     }
@@ -4754,6 +4764,11 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
     });
 }
 
+static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    const float * op_params = (const float *)dst->op_params;
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
+}
+
 #ifdef GGML_VULKAN_RUN_TESTS
 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
     if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
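The new ggml_vk_leaky_relu wrapper forwards the negative slope stored in op_params[0] as a push constant; the shader then applies the usual piecewise function. A scalar reference of what the pipeline computes, with an illustrative slope value:

    #include <cstdio>

    static float leaky_relu(float x, float negative_slope) {
        return x >= 0.0f ? x : negative_slope * x;
    }

    int main() {
        const float slope = 0.1f;                    // illustrative value
        for (float x : {-2.0f, -0.5f, 0.0f, 3.0f}) {
            std::printf("leaky_relu(%5.2f) = %5.2f\n", x, leaky_relu(x, slope));
        }
        return 0;
    }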
@@ -5431,7 +5446,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 
     const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
 
-    bool mmp = (use_src0 && use_src1) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+    bool mmp = (use_src0 && use_src1 && (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID)) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
 
     const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig);
     const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);
@@ -5485,6 +5500,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_LEAKY_RELU:
         break;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -5701,7 +5717,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
+static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
@@ -5762,6 +5778,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_LEAKY_RELU:
         break;
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
@@ -5882,6 +5899,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_TIMESTEP_EMBEDDING:
         ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
 
+        break;
+    case GGML_OP_LEAKY_RELU:
+        ggml_vk_leaky_relu(ctx, compute_ctx, src0, node);
+
         break;
     case GGML_OP_MUL_MAT:
         ggml_vk_mul_mat(ctx, compute_ctx, src0, src1, node);
@@ -5895,7 +5916,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    ctx->tensor_ctxs[node] = compute_ctx;
+    ctx->tensor_ctxs[node_idx] = compute_ctx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     // Force context reset on each node so that each tensor ends up in its own context
@@ -5905,12 +5926,12 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
     if (last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor = node;
+        compute_ctx->exit_tensor_idx = node_idx;
         ctx->compute_ctx.reset();
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5942,6 +5963,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_SUM_ROWS:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_LEAKY_RELU:
    case GGML_OP_REPEAT:
         extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
@@ -5978,7 +6000,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     ggml_vk_check_results_0(tensor);
 #endif
 
-    vk_context subctx = ctx->tensor_ctxs[tensor].lock();
+    vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
@@ -5990,7 +6012,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         ggml_vk_submit(subctx, ctx->fence);
     }
 
-    if (tensor == subctx->exit_tensor) {
+    if (tensor_idx == subctx->exit_tensor_idx) {
         VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
         ctx->device->device.resetFences({ ctx->fence });
 
@@ -6002,8 +6024,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         subctx->out_memcpys.clear();
     }
 
-    ctx->tensor_ctxs.erase(tensor);
-
     return true;
 }
 
@@ -6496,8 +6516,11 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
         last_node -= 1;
     }
 
+    // Reserve tensor context space for all nodes
+    ctx->tensor_ctxs.resize(cgraph->n_nodes);
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node);
     }
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6507,7 +6530,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
             continue;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node);
+        bool ok = ggml_vk_compute_forward(ctx, node, i);
         if (!ok) {
             if (node->op == GGML_OP_UNARY) {
                 std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -6644,6 +6667,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_SUM_ROWS:
         case GGML_OP_IM2COL:
         case GGML_OP_TIMESTEP_EMBEDDING:
+        case GGML_OP_LEAKY_RELU:
             return true;
        default:
             return false;
@@ -7101,12 +7125,12 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int mode = ((int32_t *) tensor->op_params)[2];
         //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
         const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
-        float freq_base = ((float *) tensor->op_params)[5];
-        float freq_scale = ((float *) tensor->op_params)[6];
-        float ext_factor = ((float *) tensor->op_params)[7];
-        float attn_factor = ((float *) tensor->op_params)[8];
-        float beta_fast = ((float *) tensor->op_params)[9];
-        float beta_slow = ((float *) tensor->op_params)[10];
+        const float freq_base = ((float *) tensor->op_params)[5];
+        const float freq_scale = ((float *) tensor->op_params)[6];
+        const float ext_factor = ((float *) tensor->op_params)[7];
+        const float attn_factor = ((float *) tensor->op_params)[8];
+        const float beta_fast = ((float *) tensor->op_params)[9];
+        const float beta_slow = ((float *) tensor->op_params)[10];
         tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
     } else if (tensor->op == GGML_OP_UNARY) {
         switch (ggml_get_unary_op(tensor)) {
@@ -7167,6 +7191,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t dim = tensor->op_params[0];
         const int32_t max_period = tensor->op_params[1];
         tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
+    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+        const float * op_params = (const float *)tensor->op_params;
+        tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
         GGML_ABORT("fatal error");
@ -37,6 +37,9 @@
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
int ggml_sve_cnt_b = 0;
|
||||||
|
#endif
|
||||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
#undef GGML_USE_LLAMAFILE
|
#undef GGML_USE_LLAMAFILE
|
||||||
#endif
|
#endif
|
||||||
|
@ -488,9 +491,16 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
y[i] = ggml_compute_fp32_to_bf16(x[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
|
void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
#if defined(__AVX512BF16__)
|
#if defined(__AVX512BF16__)
|
||||||
|
// subnormals are flushed to zero on this platform
|
||||||
for (; i + 32 <= n; i += 32) {
|
for (; i + 32 <= n; i += 32) {
|
||||||
_mm512_storeu_si512(
|
_mm512_storeu_si512(
|
||||||
(__m512i *)(y + i),
|
(__m512i *)(y + i),
|
||||||
|
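The new `ggml_fp32_to_bf16_row_ref` provides a scalar reference path that does not flush subnormals, unlike the AVX512BF16 vector path annotated above. A standalone sketch (not the ggml implementation itself) of the kind of fp32 to bf16 conversion the reference path relies on, assuming the usual quiet-NaN plus round-to-nearest-even scheme:

```cpp
// Illustrative fp32 -> bf16 conversion: quiet any NaN, then round the upper 16 bits
// to nearest-even. Subnormals are passed through rather than flushed to zero.
#include <cstdint>
#include <cstring>

static uint16_t fp32_to_bf16(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    if ((u & 0x7fffffff) > 0x7f800000) {        // NaN: keep sign, force a quiet mantissa bit
        return (uint16_t)((u >> 16) | 64);
    }
    return (uint16_t)((u + (0x7fff + ((u >> 16) & 1))) >> 16); // round to nearest even
}
```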
@@ -970,7 +980,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
-        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -2310,7 +2320,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
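The ELU change replaces `expf(x[i]) - 1` with `expm1f(x[i])`, which avoids the cancellation that the explicit subtraction suffers for inputs close to zero. A small demo, separate from ggml, of that difference:

```cpp
// Why the ELU hunk prefers expm1f: for tiny x, expf(x) rounds to a value near 1 and the
// subtraction loses most significant bits, while expm1f computes exp(x) - 1 directly.
#include <cmath>
#include <cstdio>

int main() {
    const float x = -1e-7f;
    printf("expf(x) - 1 = %.9g\n", expf(x) - 1.0f); // coarse, dominated by rounding of expf(x)
    printf("expm1f(x)   = %.9g\n", expm1f(x));      // ~ -1.0e-07, accurate to float precision
    return 0;
}
```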
@@ -3575,6 +3585,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!ggml_sve_cnt_b) {
+        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     ggml_critical_section_end();
@@ -5382,6 +5398,7 @@ static struct ggml_tensor * ggml_group_norm_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
 
     bool is_node = false;
@@ -5392,7 +5409,8 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    ggml_set_op_params_i32(result, 0, n_groups);
+    ggml_set_op_params_f32(result, 1, eps);
 
     result->op = GGML_OP_GROUP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5404,15 +5422,17 @@ static struct ggml_tensor * ggml_group_norm_impl(
 struct ggml_tensor * ggml_group_norm(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct ggml_tensor * ggml_group_norm_inplace(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // ggml_mul_mat
@@ -12125,10 +12145,11 @@ static void ggml_compute_forward_group_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
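These hunks thread an explicit `eps` through the group-norm API: it is stored as op param 1 at graph-build time and read back with `memcpy` in the CPU kernel, replacing the hard-coded `1e-6f`. A minimal, self-contained sketch (not the ggml kernel) of what that epsilon guards in the normalization:

```cpp
// Each group of channels is normalized to zero mean and unit variance; eps keeps the
// division stable when the group variance is close to zero.
#include <cmath>
#include <vector>

void group_norm_one_group(std::vector<float> & x, float eps) {
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= (float)x.size();

    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= (float)x.size();

    const float scale = 1.0f / std::sqrt(var + eps); // eps comes from op_params[1] in the real kernel
    for (float & v : x) v = (v - mean) * scale;
}
```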
@@ -20709,7 +20730,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(ggml_bf16_t);
-                ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
+                ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case GGML_TYPE_F32:

@@ -27,5 +27,9 @@ void main() {
 
     const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
 
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
     data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
+#else
+    data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
+#endif
 }

ggml/src/vulkan-shaders/leaky_relu.comp (new file, 22 lines)
@@ -0,0 +1,22 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float val = float(data_a[i]);
+    data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
+}
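The new shader evaluates leaky ReLU per element as `max(val, 0) + min(val, 0) * slope`, which is the same function `ggml_vec_leaky_relu_f32` computes on the CPU with a branch-style expression. A scalar reference, for illustration only:

```cpp
// max(x, 0) keeps the positive part; min(x, 0) * slope scales the negative part.
#include <algorithm>

float leaky_relu(float x, float negative_slope) {
    return std::max(x, 0.0f) + std::min(x, 0.0f) * negative_slope;
}

// leaky_relu( 2.0f, 0.1f) ==  2.0f
// leaky_relu(-2.0f, 0.1f) == -0.2f
```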
@@ -16,6 +16,13 @@ void main() {
     const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
     const uint tid = gl_LocalInvocationID.x;
 
+    // There are not enough cols to use all threads
+    if (tid >= p.ncols) {
+        return;
+    }
+
+    const uint block_size = min(p.ncols, BLOCK_SIZE);
+
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
 
@@ -23,8 +30,8 @@ void main() {
 
     tmp[tid] = FLOAT_TYPE(0.0f);
 
-    [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
-        const uint col = i*BLOCK_SIZE + 2*tid;
+    [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
+        const uint col = i*block_size + 2*tid;
         const uint ib = (row*p.ncols + col)/QUANT_K; // block index
         const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
         const uint iybs = col - col%QUANT_K; // y block start index
@@ -38,7 +45,7 @@ void main() {
 
     // sum up partial sums and write back result
     barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
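These hunks clamp the effective workgroup width to `min(p.ncols, BLOCK_SIZE)` and retire threads whose `tid` has no column to work on, then run the usual halving reduction over the partial sums. A CPU-side illustration of that reduction, assuming a power-of-two `block_size` and one partial sum per thread:

```cpp
// Sequential stand-in for the shared-memory reduction the shader performs after the
// dot-product loop; on the GPU every "tid" iteration of the inner loop runs in parallel.
#include <vector>

float reduce_partials(std::vector<float> tmp /* size == block_size */) {
    for (size_t s = tmp.size() / 2; s > 0; s >>= 1) {
        for (size_t tid = 0; tid < s; ++tid) {
            tmp[tid] += tmp[tid + s];
        }
        // barrier() would separate the rounds on the GPU
    }
    return tmp[0]; // thread 0 writes this back to the output row
}
```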
@@ -18,6 +18,7 @@
 #include <cstdlib>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <algorithm>
 
 #ifdef _WIN32
 #include <windows.h>
@@ -179,11 +180,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
     return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
 }
 
-#ifdef _WIN32
-static const char path_separator = '\\';
-#else
-static const char path_separator = '/';
-#endif
+static const char path_separator = '/';
 
 std::string join_paths(const std::string& path1, const std::string& path2) {
     return path1 + path_separator + path2;
@@ -198,7 +195,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
     std::string out_fname = join_paths(output_dir, name + ".spv");
     std::string in_path = join_paths(input_dir, in_fname);
 
+#ifdef _WIN32
+    std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
+#else
     std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
+#endif
     for (const auto& define : defines) {
         cmd.push_back("-D" + define.first + "=" + define.second);
     }
@@ -398,6 +399,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
     }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    }));
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
     }));
@@ -418,6 +422,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     }));
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     }));
@@ -476,10 +483,16 @@ void write_output_files() {
 
     for (const auto& pair : shader_fnames) {
         const std::string& name = pair.first;
-        const std::string& path = pair.second;
+#ifdef _WIN32
+        std::string path = pair.second;
+        std::replace(path.begin(), path.end(), '/', '\\' );
+#else
+        const std::string& path = pair.second;
+#endif
+
         FILE* spv = fopen(path.c_str(), "rb");
         if (!spv) {
-            std::cerr << "Error opening SPIR-V file: " << path << "\n";
+            std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
             continue;
         }
 
@@ -491,7 +504,7 @@ void write_output_files() {
         size_t read_size = fread(data.data(), 1, size, spv);
         fclose(spv);
         if (read_size != size) {
-            std::cerr << "Error reading SPIR-V file: " << path << "\n";
+            std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
            continue;
        }
 
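On Windows the generator now copies each shader path and rewrites the separators in place with `std::replace`, which is why `<algorithm>` is newly included, and it quotes the glslc arguments. A quick, self-contained illustration of that `std::replace` call (the path string is hypothetical):

```cpp
#include <algorithm>
#include <iostream>
#include <string>

int main() {
    std::string path = "vulkan-shaders/out/relu_f32.spv"; // illustrative path only
    std::replace(path.begin(), path.end(), '/', '\\');    // in-place separator rewrite
    std::cout << path << "\n";                            // vulkan-shaders\out\relu_f32.spv
    return 0;
}
```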
@@ -161,6 +161,7 @@ class Keys:
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
         EOT_ID = "tokenizer.ggml.eot_token_id"
+        EOM_ID = "tokenizer.ggml.eom_token_id"
 
     class Adapter:
         TYPE = "adapter.type"
@@ -1327,3 +1328,4 @@ KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
 KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID

@@ -828,6 +828,9 @@ class GGUFWriter:
     def add_eot_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
+    def add_eom_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOM_ID, id)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
@@ -284,20 +284,67 @@ class Metadata:
         ########################
         if model_card is not None:
 
-            if "model_name" in model_card and metadata.name is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.name = model_card.get("model_name")
+            def use_model_card_metadata(metadata_key: str, model_card_key: str):
+                if model_card_key in model_card and getattr(metadata, metadata_key, None) is None:
+                    setattr(metadata, metadata_key, model_card.get(model_card_key))
 
-            if "model_creator" in model_card and metadata.author is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.author = model_card.get("model_creator")
+            def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
+                # Note: Will append rather than replace if already exist
+                tags_value = model_card.get(model_card_key, None)
+                if tags_value is None:
+                    return
 
-            if "model_type" in model_card and metadata.basename is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.basename = model_card.get("model_type")
+                current_value = getattr(metadata, metadata_key, None)
+                if current_value is None:
+                    current_value = []
+
+                if isinstance(tags_value, str):
+                    current_value.append(tags_value)
+                elif isinstance(tags_value, list):
+                    current_value.extend(tags_value)
+
+                setattr(metadata, metadata_key, current_value)
+
+            # LLAMA.cpp's direct internal convention
+            # (Definitely not part of hugging face formal/informal standard)
+            #########################################
+            use_model_card_metadata("name", "name")
+            use_model_card_metadata("author", "author")
+            use_model_card_metadata("version", "version")
+            use_model_card_metadata("organization", "organization")
+            use_model_card_metadata("description", "description")
+            use_model_card_metadata("finetune", "finetune")
+            use_model_card_metadata("basename", "basename")
+            use_model_card_metadata("size_label", "size_label")
+            use_model_card_metadata("source_url", "url")
+            use_model_card_metadata("source_doi", "doi")
+            use_model_card_metadata("source_uuid", "uuid")
+            use_model_card_metadata("source_repo_url", "repo_url")
+
+            # LLAMA.cpp's huggingface style convention
+            # (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
+            ###########################################
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_author")
+            use_model_card_metadata("version", "model_version")
+            use_model_card_metadata("organization", "model_organization")
+            use_model_card_metadata("description", "model_description")
+            use_model_card_metadata("finetune", "model_finetune")
+            use_model_card_metadata("basename", "model_basename")
+            use_model_card_metadata("size_label", "model_size_label")
+            use_model_card_metadata("source_url", "model_url")
+            use_model_card_metadata("source_doi", "model_doi")
+            use_model_card_metadata("source_uuid", "model_uuid")
+            use_model_card_metadata("source_repo_url", "model_repo_url")
+
+            # Hugging Face Direct Convention
+            #################################
+
+            # Not part of huggingface model card standard but notice some model creator using it
+            # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_creator")
+            use_model_card_metadata("basename", "model_type")
 
             if "base_model" in model_card:
                 # This represents the parent models that this is based on
@@ -329,58 +376,18 @@ class Metadata:
                     base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
                 metadata.base_models.append(base_model)
 
-            if "license" in model_card and metadata.license is None:
-                metadata.license = model_card.get("license")
-
-            if "license_name" in model_card and metadata.license_name is None:
-                metadata.license_name = model_card.get("license_name")
-
-            if "license_link" in model_card and metadata.license_link is None:
-                metadata.license_link = model_card.get("license_link")
-
-            tags_value = model_card.get("tags", None)
-            if tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(tags_value, str):
-                    metadata.tags.append(tags_value)
-                elif isinstance(tags_value, list):
-                    metadata.tags.extend(tags_value)
-
-            pipeline_tags_value = model_card.get("pipeline_tag", None)
-            if pipeline_tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(pipeline_tags_value, str):
-                    metadata.tags.append(pipeline_tags_value)
-                elif isinstance(pipeline_tags_value, list):
-                    metadata.tags.extend(pipeline_tags_value)
-
-            language_value = model_card.get("languages", model_card.get("language", None))
-            if language_value is not None:
-
-                if metadata.languages is None:
-                    metadata.languages = []
-
-                if isinstance(language_value, str):
-                    metadata.languages.append(language_value)
-                elif isinstance(language_value, list):
-                    metadata.languages.extend(language_value)
-
-            dataset_value = model_card.get("datasets", model_card.get("dataset", None))
-            if dataset_value is not None:
-
-                if metadata.datasets is None:
-                    metadata.datasets = []
-
-                if isinstance(dataset_value, str):
-                    metadata.datasets.append(dataset_value)
-                elif isinstance(dataset_value, list):
-                    metadata.datasets.extend(dataset_value)
+            use_model_card_metadata("license", "license")
+            use_model_card_metadata("license_name", "license_name")
+            use_model_card_metadata("license_link", "license_link")
+
+            use_array_model_card_metadata("tags", "tags")
+            use_array_model_card_metadata("tags", "pipeline_tag")
+
+            use_array_model_card_metadata("languages", "languages")
+            use_array_model_card_metadata("languages", "language")
+
+            use_array_model_card_metadata("datasets", "datasets")
+            use_array_model_card_metadata("datasets", "dataset")
 
             # Hugging Face Parameter Heuristics
             ####################################

@@ -25,14 +25,12 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
 
 # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
 def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
-    n = n.astype(np.float32, copy=False).view(np.int32)
+    n = n.astype(np.float32, copy=False).view(np.uint32)
     # force nan to quiet
-    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-    # flush subnormals to zero
-    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
+    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
    # round to nearest even
-    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-    return n.astype(np.int16)
+    n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
+    return n.astype(np.uint16)
 
 
 # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
@@ -49,10 +47,10 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
 
 
 def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
-    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)
 
 
-__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)
 
 
 def quantize_bf16(n: np.ndarray):

@@ -434,7 +434,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
 
 __STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
                                                          struct ggml_tensor* a) {
-    return ggml_group_norm(ctx, a, 32);
+    return ggml_group_norm(ctx, a, 32, 1e-6f);
 }
 
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
@@ -575,7 +575,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
     }
 
-    x = ggml_group_norm(ctx, x, num_groups);
+    x = ggml_group_norm(ctx, x, num_groups, 1e-6f);
     if (w != NULL && b != NULL) {
         x = ggml_mul(ctx, x, w);
         // b = ggml_repeat(ctx, b, x);

@@ -18,17 +18,14 @@
 //
 
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
     }
-    s = std::move(result);
 }
 
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
@@ -1044,6 +1041,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -1123,7 +1123,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
@@ -1701,7 +1701,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1757,6 +1758,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
     const char * text,

@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,

@@ -357,6 +357,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -454,6 +455,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+    { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
 
     { LLM_KV_ADAPTER_TYPE, "adapter.type" },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -5623,6 +5625,7 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
             { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
             { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+            { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -5675,6 +5678,17 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+        // for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
     }
 
     // build special tokens cache
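The last hunks register `<|eom_id|>` as its own special token (read from GGUF metadata or found by text as a workaround) and make `llama_token_is_eog` treat it as end-of-generation alongside EOS and EOT. A hedged sketch of how a caller typically uses that check; the surrounding decode loop is assumed, not taken from this diff:

```cpp
// llama_token_is_eog() is the public llama.h wrapper around llama_token_is_eog_impl();
// after this change it also returns true for the "<|eom_id|>" end-of-message token.
#include "llama.h"

bool should_stop(const llama_model * model, llama_token tok) {
    return llama_token_is_eog(model, tok); // EOS, EOT or EOM
}
```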