rewritten checkpoint 1 - before coopmat

This commit is contained in:
Concedo 2024-12-13 16:55:23 +08:00
commit 4c4ce5e808
59 changed files with 9147 additions and 28724 deletions

View file

@ -349,6 +349,18 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
return true; return true;
} }
static std::string list_builtin_chat_templates() {
std::vector<const char *> supported_tmpl;
int32_t res = llama_chat_builtin_templates(nullptr, 0);
supported_tmpl.resize(res);
res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
std::ostringstream msg;
for (auto & tmpl : supported_tmpl) {
msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
}
return msg.str();
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// load dynamic backends // load dynamic backends
ggml_backend_load_all(); ggml_backend_load_all();
@ -775,7 +787,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { [](common_params & params) {
params.warmup = false; params.warmup = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--spm-infill"}, {"--spm-infill"},
string_format( string_format(
@ -1815,9 +1827,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE", {"--chat-template"}, "JINJA_TEMPLATE",
string_format(
"set custom jinja chat template (default: template taken from model's metadata)\n" "set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n" "if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) { if (!common_chat_verify_template(value)) {
throw std::runtime_error(string_format( throw std::runtime_error(string_format(

View file

@ -129,6 +129,7 @@ struct common_params_sampling {
bool penalize_nl = false; // consider newlines as a repeatable token bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false; bool ignore_eos = false;
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
bool timing_per_token = false;
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
@ -210,7 +211,7 @@ struct common_params {
struct common_params_speculative speculative; struct common_params_speculative speculative;
std::string model = ""; // model path // NOLINT std::string model = ""; // model path // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT std::string hf_repo = ""; // HF repo // NOLINT

View file

@ -658,6 +658,12 @@ class Model:
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
# ref: https://huggingface.co/facebook/chameleon-7b # ref: https://huggingface.co/facebook/chameleon-7b
res = "chameleon" res = "chameleon"
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
res = "roberta-bpe"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -1831,29 +1837,40 @@ class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM model_arch = gguf.MODEL_ARCH.MINICPM
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"] super().set_gguf_parameters()
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) embedding_scale = float(self.hparams["scale_emb"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_embedding_scale(embedding_scale)
self.gguf_writer.add_block_count(block_count) logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_residual_scale(residual_scale)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_logit_scale(logit_scale)
self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
if self.hparams.get("rope_scaling") is not None:
if self.hparams["rope_scaling"].get("type") == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
def set_vocab(self): def set_vocab(self):
self._set_vocab_llama_hf() self._set_vocab_sentencepiece()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused
@ -1863,9 +1880,9 @@ class MiniCPMModel(Model):
# HF models permute some of the tensors, so we need to undo that # HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")): if name.endswith(("q_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight")): if name.endswith(("k_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@ -2519,7 +2536,7 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "CamembertModel") @Model.register("BertModel", "CamembertModel", "RobertaModel")
class BertModel(Model): class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT model_arch = gguf.MODEL_ARCH.BERT
@ -2560,7 +2577,8 @@ class BertModel(Model):
# we need this to validate the size of the token_type embeddings # we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings # though currently we are passing all zeros to the token_type embeddings
self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" # "Sequence A" or "Sequence B"
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
# convert to phantom space vocab # convert to phantom space vocab
def phantom(tok): def phantom(tok):

View file

@ -17,7 +17,7 @@
# #
# python3 convert_hf_to_gguf_update.py <huggingface_token> # python3 convert_hf_to_gguf_update.py <huggingface_token>
# #
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
@ -102,6 +102,8 @@ models = [
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
] ]

View file

@ -12,6 +12,10 @@
#include "ggml-cuda.h" #include "ggml-cuda.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
@ -1172,6 +1176,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: CLIP using Vulkan backend\n", __func__); LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif #endif
#ifdef GGML_USE_SYCL
new_clip->backend = ggml_backend_sycl_init(0);
LOG_INF("%s: CLIP using SYCL backend\n", __func__);
#endif
if (!new_clip->backend) { if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init(); new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__); LOG_INF("%s: CLIP using CPU backend\n", __func__);

View file

@ -1,25 +0,0 @@
#!/bin/bash
# Download and update deps for binary
# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
echo "download js bundle files"
# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
echo >> $PUBLIC/deps_tailwindcss.js # add newline
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
echo >> $PUBLIC/deps_daisyui.min.css # add newline
curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
echo >> $PUBLIC/deps_markdown-it.js # add newline
ls -lah $PUBLIC

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

View file

@ -407,6 +407,9 @@ class SimpleChat {
if (curLine.startsWith("data:")) { if (curLine.startsWith("data:")) {
curLine = curLine.substring(5); curLine = curLine.substring(5);
} }
if (curLine.trim() === "[DONE]") {
break;
}
let curJson = JSON.parse(curLine); let curJson = JSON.parse(curLine);
console.debug("DBUG:SC:PART:Json:", curJson); console.debug("DBUG:SC:PART:Json:", curJson);
this.append_response(this.response_extract_stream(curJson, apiEP)); this.append_response(this.response_extract_stream(curJson, apiEP));

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,9 @@
#!/bin/bash #!/bin/bash
# make sure we are in the right directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
set -eu set -eu
if [ $# -lt 1 ] if [ $# -lt 1 ]

View file

@ -12,13 +12,13 @@ def create_server():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[ [
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
] ]
) )
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server global server
server.start() server.start()
res = server.make_request("POST", "/chat/completions", data={ res = server.make_request("POST", "/chat/completions", data={
@ -30,29 +30,27 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
], ],
}) })
assert res.status_code == 200 assert res.status_code == 200
assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted assert res.body["usage"]["completion_tokens"] == n_predicted
choice = res.body["choices"][0] choice = res.body["choices"][0]
assert "assistant" == choice["message"]["role"] assert "assistant" == choice["message"]["role"]
assert match_regex(re_content, choice["message"]["content"]) assert match_regex(re_content, choice["message"]["content"])
if truncated: assert choice["finish_reason"] == finish_reason
assert choice["finish_reason"] == "length"
else:
assert choice["finish_reason"] == "stop"
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[ [
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
] ]
) )
def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server global server
server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
server.start() server.start()
res = server.make_stream_request("POST", "/chat/completions", data={ res = server.make_stream_request("POST", "/chat/completions", data={
"model": model,
"max_tokens": max_tokens, "max_tokens": max_tokens,
"messages": [ "messages": [
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
@ -63,16 +61,13 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r
content = "" content = ""
for data in res: for data in res:
choice = data["choices"][0] choice = data["choices"][0]
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
if choice["finish_reason"] in ["stop", "length"]: if choice["finish_reason"] in ["stop", "length"]:
assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["prompt_tokens"] == n_prompt
assert data["usage"]["completion_tokens"] == n_predicted assert data["usage"]["completion_tokens"] == n_predicted
assert "content" not in choice["delta"] assert "content" not in choice["delta"]
assert match_regex(re_content, content) assert match_regex(re_content, content)
# FIXME: not sure why this is incorrect in stream mode assert choice["finish_reason"] == finish_reason
# if truncated:
# assert choice["finish_reason"] == "length"
# else:
# assert choice["finish_reason"] == "stop"
else: else:
assert choice["finish_reason"] is None assert choice["finish_reason"] is None
content += choice["delta"]["content"] content += choice["delta"]["content"]
@ -93,7 +88,7 @@ def test_chat_completion_with_openai_library():
temperature=0.8, temperature=0.8,
) )
print(res) print(res)
assert res.choices[0].finish_reason == "stop" assert res.choices[0].finish_reason == "length"
assert res.choices[0].message.content is not None assert res.choices[0].message.content is not None
assert match_regex("(Suddenly)+", res.choices[0].message.content) assert match_regex("(Suddenly)+", res.choices[0].message.content)
@ -146,3 +141,20 @@ def test_invalid_chat_completion_req(messages):
}) })
assert res.status_code == 400 or res.status_code == 500 assert res.status_code == 400 or res.status_code == 500
assert "error" in res.body assert "error" in res.body
def test_chat_completion_with_timings_per_token():
global server
server.start()
res = server.make_stream_request("POST", "/chat/completions", data={
"max_tokens": 10,
"messages": [{"role": "user", "content": "test"}],
"stream": True,
"timings_per_token": True,
})
for data in res:
assert "timings" in data
assert "prompt_per_second" in data["timings"]
assert "predicted_per_second" in data["timings"]
assert "predicted_n" in data["timings"]
assert data["timings"]["predicted_n"] <= 10

View file

@ -51,6 +51,24 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
content += data["content"] content += data["content"]
def test_completion_stream_vs_non_stream():
global server
server.start()
res_stream = server.make_stream_request("POST", "/completion", data={
"n_predict": 8,
"prompt": "I believe the meaning of life is",
"stream": True,
})
res_non_stream = server.make_request("POST", "/completion", data={
"n_predict": 8,
"prompt": "I believe the meaning of life is",
})
content_stream = ""
for data in res_stream:
content_stream += data["content"]
assert content_stream == res_non_stream.body["content"]
@pytest.mark.parametrize("n_slots", [1, 2]) @pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int): def test_consistent_result_same_seed(n_slots: int):
global server global server
@ -221,3 +239,24 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
assert len(res.body["content"]) > 10 assert len(res.body["content"]) > 10
# FIXME: the result is not deterministic when using other slot than slot 0 # FIXME: the result is not deterministic when using other slot than slot 0
# assert match_regex(re_content, res.body["content"]) # assert match_regex(re_content, res.body["content"])
def test_n_probs():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"n_probs": 10,
"temperature": 0.0,
"n_predict": 5,
})
assert res.status_code == 200
assert "completion_probabilities" in res.body
assert len(res.body["completion_probabilities"]) == 5
for tok in res.body["completion_probabilities"]:
assert "probs" in tok
assert len(tok["probs"]) == 10
for prob in tok["probs"]:
assert "prob" in prob
assert "tok_str" in prob
assert 0.0 <= prob["prob"] <= 1.0

View file

@ -82,6 +82,37 @@ def test_different_draft_min_draft_max():
last_content = res.body["content"] last_content = res.body["content"]
def test_slot_ctx_not_exceeded():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
def test_with_ctx_shift():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"n_predict": 64,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
assert res.body["tokens_predicted"] == 64
assert res.body["truncated"] == True
@pytest.mark.parametrize("n_slots,n_requests", [ @pytest.mark.parametrize("n_slots,n_requests", [
(1, 2), (1, 2),
(2, 2), (2, 2),

View file

@ -20,6 +20,7 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <memory>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
template <typename T> template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) { static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value // Fallback null to default value
@ -485,48 +475,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
return out; return out;
} }
struct completion_token_output {
llama_token tok;
std::string text_to_send;
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
};
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
json out = json::array();
for (const auto & prob : probs) {
json probs_for_token = json::array();
for (const auto & p : prob.probs) {
const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json {
{"tok_str", tok_str},
{"prob", p.prob},
});
}
const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json {
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}
static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str = const std::string str =
std::string(event) + ": " + std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) + data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain) "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
LOG_DBG("data stream, to_send: %s", str.c_str()); LOG_DBG("data stream, to_send: %s", str.c_str());
@ -604,155 +557,6 @@ static json oaicompat_completion_params_parse(
return llama_params; return llama_params;
} }
static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res = json {
{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}},
{"id", completion_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the calee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json {
{"choices", choices},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}
};
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
ret.push_back({"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}});
}
return std::vector<json>({ret});
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array(); json data = json::array();
int i = 0; int i = 0;
@ -844,43 +648,3 @@ static json format_detokenized_response(const std::string & content) {
{"content", content} {"content", content}
}; };
} }
static json format_error_response(const std::string & message, const enum error_type type) {
std::string type_str;
int code = 500;
switch (type) {
case ERROR_TYPE_INVALID_REQUEST:
type_str = "invalid_request_error";
code = 400;
break;
case ERROR_TYPE_AUTHENTICATION:
type_str = "authentication_error";
code = 401;
break;
case ERROR_TYPE_NOT_FOUND:
type_str = "not_found_error";
code = 404;
break;
case ERROR_TYPE_SERVER:
type_str = "server_error";
code = 500;
break;
case ERROR_TYPE_PERMISSION:
type_str = "permission_error";
code = 403;
break;
case ERROR_TYPE_NOT_SUPPORTED:
type_str = "not_supported_error";
code = 501;
break;
case ERROR_TYPE_UNAVAILABLE:
type_str = "unavailable_error";
code = 503;
break;
}
return json {
{"code", code},
{"message", message},
{"type", type_str},
};
}

View file

@ -0,0 +1,268 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<meta name="color-scheme" content="light dark">
<title>🦙 llama.cpp - chat</title>
</head>
<body>
<div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
<div class="flex flex-row drawer lg:drawer-open">
<input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
<!-- sidebar -->
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
<div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
<div class="flex flex-row items-center justify-between mb-4 mt-4">
<h2 class="font-bold ml-4">Conversations</h2>
<!-- close sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
</svg>
</label>
</div>
<!-- list of conversations -->
<div :class="{
'btn btn-ghost justify-start': true,
'btn-active': messages.length === 0,
}" @click="newConversation">
+ New conversation
</div>
<div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)">
<span class="truncate">{{ conv.messages[0].content }}</span>
</div>
<div class="text-center text-xs opacity-40 mt-auto mx-4">
Conversations are saved to browser's localStorage
</div>
</div>
</div>
<!-- main view -->
<div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
<!-- header -->
<div class="flex flex-row items-center mt-6 mb-6">
<!-- open sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
</svg>
</label>
<div class="grow text-2xl font-bold ml-2">llama.cpp</div>
<!-- action buttons (top right) -->
<div class="flex items-center">
<div v-if="messages.length > 0" class="dropdown dropdown-end">
<!-- "more" button -->
<button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
</svg>
</button>
<!-- "more" dropdown menu -->
<ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
<li @click="downloadConv(viewingConvId)"><a>Download</a></li>
<li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
</ul>
</div>
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
<!-- settings button -->
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
</svg>
</button>
<!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
<div class="dropdown dropdown-end dropdown-bottom">
<div tabindex="0" role="button" class="btn m-1">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
</svg>
</div>
<ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
<li>
<button
class="btn btn-sm btn-block btn-ghost justify-start"
:class="{ 'btn-active': selectedTheme === 'auto' }"
@click="setSelectedTheme('auto')">
auto
</button>
</li>
<li v-for="theme in themes">
<input
type="radio"
name="theme-dropdown"
class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
:aria-label="theme"
:value="theme"
:checked="selectedTheme === theme"
@click="setSelectedTheme(theme)" />
</li>
</ul>
</div>
</div>
</div>
<!-- chat messages -->
<div id="messages-list" class="flex flex-col grow overflow-y-auto">
<div class="mt-auto flex justify-center">
<!-- placeholder to shift the message to the bottom -->
{{ messages.length === 0 ? 'Send a message to start' : '' }}
</div>
<div v-for="msg in messages" class="group">
<div :class="{
'chat': true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
}">
<div :class="{
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingMsg && editingMsg.id === msg.id">
<textarea
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="msg.content"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
<button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
</template>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content" />
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
📋 Copy
</button>
</div>
</div>
<!-- pending (ongoing) assistant message -->
<div id="pending-msg" class="chat chat-start">
<div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
<span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
<vue-markdown v-else :source="pendingMsg.content" />
</div>
</div>
</div>
<!-- chat input -->
<div class="flex flex-row items-center mt-8 mb-6">
<textarea
class="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
v-model="inputMsg"
@keydown.enter.exact.prevent="sendMessage"
@keydown.enter.shift.exact.prevent="inputMsg += '\n'"
:disabled="isGenerating"
id="msg-input"
></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
</div>
</div>
</div>
<!-- modal for editing config -->
<dialog class="modal" :class="{'modal-open': showConfigDialog}">
<div class="modal-box">
<h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
<settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
<label class="form-control mb-2">
<div class="label">System Message</div>
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
</label>
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
<!-- Section: Other sampler settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Other sampler settings</summary>
<div class="collapse-content">
<!-- Samplers queue -->
<settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
<!-- Samplers -->
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Penalties settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Penalties settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content">
<label class="form-control mb-2">
<!-- Custom parameters input -->
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
</label>
</div>
</details>
</div>
<!-- action buttons -->
<div class="modal-action">
<button class="btn" @click="resetConfigDialog">Reset to default</button>
<button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
<button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
</div>
</div>
</dialog>
</div>
<!-- Template to be used by settings modal -->
<template id="settings-modal-short-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
<!-- Show help message on hovering on the input label -->
<div class="dropdown dropdown-hover">
<div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{{ configInfo[configKey] || '(no help message available)' }}
</div>
</div>
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
</label>
</template>
<script type="module" src="/src/main.js"></script>
</body>
</html>

2783
examples/server/webui/package-lock.json generated Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,23 @@
{
"name": "webui",
"private": true,
"version": "0.0.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview"
},
"devDependencies": {
"vite": "^5.4.10"
},
"dependencies": {
"autoprefixer": "^10.4.20",
"daisyui": "^4.12.14",
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.15",
"vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13"
}
}

View file

@ -0,0 +1,6 @@
export default {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
}

View file

@ -0,0 +1,456 @@
import './styles.css';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import { llama } from './completion.js';
import MarkdownIt from 'markdown-it';
// utility functions
const isString = (x) => !!x.toLowerCase;
const isNumeric = (n) => !isString(n) && !isNaN(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str);
// constants
const BASE_URL = localStorage.getItem('base') // for debugging
|| (new URL('.', document.baseURI).href).toString(); // for production
const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
// make sure these default values are in sync with `common.h`
samplers: 'dkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
};
const CONFIG_INFO = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
frequency_penalty: 'Limits tokens based on how often they appear in the output.',
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of token per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
// markdown support
const VueMarkdown = defineComponent(
(props) => {
const md = shallowRef(new MarkdownIt({ breaks: true }));
const origFenchRenderer = md.value.renderer.rules.fence;
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
const content = tokens[idx].content;
const origRendered = origFenchRenderer(tokens, idx, ...args);
return `<div class="relative my-4">
<div class="text-right sticky top-4 mb-2 mr-2 h-0">
<button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
</div>
${origRendered}
</div>`;
};
window.copyStr = copyStr;
const content = computed(() => md.value.render(props.source));
return () => h("div", { innerHTML: content.value });
},
{ props: ["source"] }
);
// input field to be used by settings modal
const SettingsModalShortInput = defineComponent({
template: document.getElementById('settings-modal-short-input').innerHTML,
props: {
label: { type: String, required: false },
configKey: String,
configDefault: Object,
configInfo: Object,
modelValue: [Object, String, Number],
},
});
// coversations is stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
// manage conversations
getAllConversations() {
const res = [];
for (const key in localStorage) {
if (key.startsWith('conv-')) {
res.push(JSON.parse(localStorage.getItem(key)));
}
}
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
// can return null if convId does not exist
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
// if convId does not exist, create one
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
id: convId,
lastModified: Date.now(),
messages: [],
};
conv.messages.push(msg);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
getNewConvId() {
return `conv-${Date.now()}`;
},
remove(convId) {
localStorage.removeItem(convId);
},
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
conv.messages = conv.messages.filter(predicate);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
if (conv.messages.length === 0) {
StorageUtils.remove(convId);
} else {
localStorage.setItem(convId, JSON.stringify(conv));
}
return msg;
},
// manage config
getConfig() {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
// to prevent breaking changes in the future, we always provide default value for missing keys
return {
...CONFIG_DEFAULT,
...savedVal,
};
},
setConfig(config) {
localStorage.setItem('config', JSON.stringify(config));
},
getTheme() {
return localStorage.getItem('theme') || 'auto';
},
setTheme(theme) {
if (theme === 'auto') {
localStorage.removeItem('theme');
} else {
localStorage.setItem('theme', theme);
}
},
};
// scroll to bottom of chat messages
// if requiresNearBottom is true, only auto-scroll if user is near bottom
const chatScrollToBottom = (requiresNearBottom) => {
const msgListElem = document.getElementById('messages-list');
const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
if (!requiresNearBottom || (spaceToBottom < 100)) {
setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
}
};
const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalShortInput,
},
data() {
return {
conversations: StorageUtils.getAllConversations(),
messages: [], // { id: number, role: 'user' | 'assistant', content: string }
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
pendingMsg: null, // the on-going message from assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(),
showConfigDialog: false,
editingMsg: null,
// const
themes: THEMES,
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
}
},
computed: {},
mounted() {
document.getElementById('app').classList.remove('opacity-0'); // show app
// scroll to the bottom when the pending message height is updated
const pendingMsgElem = document.getElementById('pending-msg');
const resizeObserver = new ResizeObserver(() => {
if (this.isGenerating) chatScrollToBottom(true);
});
resizeObserver.observe(pendingMsgElem);
},
methods: {
hideSidebar() {
document.getElementById('toggle-drawer').checked = false;
},
setSelectedTheme(theme) {
this.selectedTheme = theme;
StorageUtils.setTheme(theme);
},
newConversation() {
if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
},
setViewingConv(convId) {
if (this.isGenerating) return;
this.viewingConvId = convId;
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
},
deleteConv(convId) {
if (this.isGenerating) return;
if (window.confirm('Are you sure to delete this conversation?')) {
StorageUtils.remove(convId);
if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
}
this.fetchConversation();
this.fetchMessages();
}
},
downloadConv(convId) {
const conversation = StorageUtils.getOneConversation(convId);
if (!conversation) {
alert('Conversation not found.');
return;
}
const conversationJson = JSON.stringify(conversation, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
},
async sendMessage() {
if (!this.inputMsg) return;
const currConvId = this.viewingConvId;
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: this.inputMsg,
});
this.fetchConversation();
this.fetchMessages();
this.inputMsg = '';
this.editingMsg = null;
this.generateMessage(currConvId);
chatScrollToBottom();
},
async generateMessage(currConvId) {
if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true;
this.editingMsg = null;
try {
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
const params = {
messages: [
{ role: 'system', content: this.config.systemMessage },
...this.messages,
],
stream: true,
cache_prompt: true,
samplers: this.config.samplers,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
};
const config = {
controller: abortController,
api_url: BASE_URL,
endpoint: '/chat/completions',
};
for await (const chunk of llama(prompt, params, config)) {
const stop = chunk.data.stop;
const addedContent = chunk.data.choices[0].delta.content;
const lastContent = this.pendingMsg.content || '';
if (addedContent) {
this.pendingMsg = {
id: this.pendingMsg.id,
role: 'assistant',
content: lastContent + addedContent,
};
}
}
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
setTimeout(() => document.getElementById('msg-input').focus(), 1);
} catch (error) {
if (error.name === 'AbortError') {
// user stopped the generation via stopGeneration() function
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
} else {
console.error(error);
alert(error);
// pop last user message
const lastUserMsg = StorageUtils.popMsg(currConvId);
this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
}
}
this.pendingMsg = null;
this.isGenerating = false;
this.stopGeneration = () => {};
this.fetchMessages();
chatScrollToBottom();
},
// message actions
regenerateMsg(msg) {
if (this.isGenerating) return;
// TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
const currConvId = this.viewingConvId;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
copyMsg(msg) {
copyStr(msg.content);
},
editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return;
const currConvId = this.viewingConvId;
const newContent = msg.content;
this.editingMsg = null;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: newContent,
});
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
// settings dialog methods
closeAndSaveConfigDialog() {
try {
if (this.config.custom.length) JSON.parse(this.config.custom);
} catch (error) {
alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
return;
}
for (const key of CONFIG_NUMERIC_KEYS) {
if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
alert(`Invalid number for ${key} (expected an integer or a float)`);
return;
}
this.config[key] = parseFloat(this.config[key]);
}
this.showConfigDialog = false;
StorageUtils.setConfig(this.config);
},
closeAndDiscardConfigDialog() {
this.showConfigDialog = false;
this.config = StorageUtils.getConfig();
},
resetConfigDialog() {
if (window.confirm('Are you sure to reset all settings?')) {
this.config = {...CONFIG_DEFAULT};
}
},
// sync state functions
fetchConversation() {
this.conversations = StorageUtils.getAllConversations();
},
fetchMessages() {
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
},
},
});
mainApp.config.errorHandler = alert;
try {
mainApp.mount('#app');
} catch (err) {
console.error(err);
document.getElementById('app').innerHTML = `<div style="margin:2em auto">
Failed to start app. Please try clearing localStorage and try again.<br/>
<br/>
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}

View file

@ -0,0 +1,26 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
.markdown {
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
pre {
@apply whitespace-pre-wrap rounded-lg p-2;
border: 1px solid currentColor;
}
/* TODO: fix markdown table */
}
.show-on-hover {
@apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
@apply cursor-pointer hover:shadow-md;
}
.chat-screen { max-width: 900px; }
.chat-bubble-base-300 {
--tw-bg-opacity: 1;
--tw-text-opacity: 1;
@apply bg-base-300 text-base-content;
}

View file

@ -0,0 +1,16 @@
/** @type {import('tailwindcss').Config} */
export default {
content: [
"./index.html",
"./src/**/*.{js,ts,jsx,tsx}",
],
theme: {
extend: {},
},
plugins: [
require('daisyui'),
],
daisyui: {
themes: ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'],
}
}

View file

@ -0,0 +1,36 @@
import { viteSingleFile } from 'vite-plugin-singlefile';
import path from 'path';
import fs from 'fs';
const GUIDE_FOR_FRONTEND = `
<!--
This is a single file build of the frontend.
It is automatically generated by the build process.
Do not edit this file directly.
To make changes, refer to the "Web UI" section in the README.
-->
`.trim();
export default {
plugins: [
viteSingleFile(),
(function llamaCppPlugin() {
let config;
return {
name: 'llamacpp:build',
apply: 'build',
async configResolved(_config) {
config = _config;
},
writeBundle() {
const outputIndexHtml = path.join(config.build.outDir, 'index.html');
const content = fs.readFileSync(outputIndexHtml, 'utf-8');
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html');
fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content);
}
}
})(),
],
};

View file

@ -505,6 +505,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK, GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ARANGE, GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT, GGML_OP_ARGSORT,
@ -1701,6 +1702,13 @@ extern "C" {
int p2, int p2,
int p3); int p3);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,] // timesteps: [N,]
// return: [N, dim] // return: [N, dim]

View file

@ -211,7 +211,12 @@ extern "C" {
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend // Add backend dynamic loading support to the backend
// Initialize the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
// Optional: obtain a score for the backend based on the system configuration
// Higher scores are preferred, 0 means the backend is not supported in the current system
typedef int (*ggml_backend_score_t)(void);
#ifdef GGML_BACKEND_DL #ifdef GGML_BACKEND_DL
# ifdef __cplusplus # ifdef __cplusplus
@ -222,15 +227,28 @@ extern "C" {
ggml_backend_reg_t ggml_backend_init(void) { \ ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \ return reg_fn(); \
} }
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
extern "C" { \
GGML_BACKEND_API int ggml_backend_score(void); \
} \
int ggml_backend_score(void) { \
return score_fn(); \
}
# else # else
# define GGML_BACKEND_DL_IMPL(reg_fn) \ # define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \ ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \ return reg_fn(); \
} }
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
GGML_BACKEND_API int ggml_backend_score(void); \
int ggml_backend_score(void) { \
return score_fn(); \
}
# endif # endif
#else #else
# define GGML_BACKEND_DL_IMPL(reg_fn) # define GGML_BACKEND_DL_IMPL(reg_fn)
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

View file

@ -2,8 +2,13 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm> #include <algorithm>
#include <codecvt>
#include <cstring> #include <cstring>
#include <filesystem>
#include <locale>
#include <memory>
#include <string> #include <string>
#include <type_traits>
#include <vector> #include <vector>
#ifdef _WIN32 #ifdef _WIN32
@ -57,9 +62,71 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
#ifdef _WIN32
using dl_handle = typename std::remove_pointer<HMODULE>::type;
struct dl_handle_deleter {
void operator()(HMODULE handle) {
FreeLibrary(handle);
}
};
static dl_handle * dl_load_library(const std::wstring & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryW(path.c_str());
SetErrorMode(old_mode);
return handle;
}
static dl_handle * dl_load_library(const std::string & path) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return dl_load_library(converter.from_bytes(path));
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
void * p = (void *) GetProcAddress(handle, name);
SetErrorMode(old_mode);
return p;
}
#else
using dl_handle = void;
struct dl_handle_deleter {
void operator()(void * handle) {
dlclose(handle);
}
};
static void * dl_load_library(const std::string & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name);
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
struct ggml_backend_reg_entry { struct ggml_backend_reg_entry {
ggml_backend_reg_t reg; ggml_backend_reg_t reg;
void * handle; dl_handle_ptr handle;
}; };
struct ggml_backend_registry { struct ggml_backend_registry {
@ -97,13 +164,16 @@ struct ggml_backend_registry {
} }
~ggml_backend_registry() { ~ggml_backend_registry() {
while (!backends.empty()) { // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
// use silent since the log system may have been destroyed at this point // since backend threads may still be running and accessing resources from the dynamic library
unload_backend(backends.back().reg, true); for (auto & entry : backends) {
if (entry.handle) {
entry.handle.release(); // NOLINT
}
} }
} }
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
if (!reg) { if (!reg) {
return; return;
} }
@ -112,7 +182,7 @@ struct ggml_backend_registry {
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif #endif
backends.push_back({ reg, handle }); backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i)); register_device(ggml_backend_reg_dev_get(reg, i));
} }
@ -126,54 +196,31 @@ struct ggml_backend_registry {
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) { ggml_backend_reg_t load_backend(const char * path, bool silent) {
#ifdef _WIN32 dl_handle_ptr handle { dl_load_library(path) };
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryA(path);
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
} }
SetErrorMode(old_mode);
return nullptr; return nullptr;
} }
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
SetErrorMode(old_mode);
if (!backend_init) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
} }
FreeLibrary(handle);
return nullptr; return nullptr;
} }
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) { auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
} }
return nullptr; return nullptr;
} }
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); ggml_backend_reg_t reg = backend_init_fn();
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
}
dlclose(handle);
return nullptr;
}
#endif
ggml_backend_reg_t reg = backend_init();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) { if (!silent) {
if (!reg) { if (!reg) {
@ -183,22 +230,19 @@ struct ggml_backend_registry {
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION); __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
} }
} }
#ifdef _WIN32
FreeLibrary(handle);
#else
dlclose(handle);
#endif
return nullptr; return nullptr;
} }
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, handle);
register_backend(reg, std::move(handle));
return reg; return reg;
} }
void unload_backend(ggml_backend_reg_t reg, bool silent) { void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(), auto it = std::find_if(backends.begin(), backends.end(),
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
if (it == backends.end()) { if (it == backends.end()) {
if (!silent) { if (!silent) {
@ -217,15 +261,6 @@ struct ggml_backend_registry {
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end()); devices.end());
// unload library
if (it->handle) {
#ifdef _WIN32
FreeLibrary((HMODULE) it->handle);
#else
dlclose(it->handle);
#endif
}
// remove backend // remove backend
backends.erase(it); backends.erase(it);
} }
@ -341,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true); get_reg().unload_backend(reg, true);
} }
void ggml_backend_load_all() { static std::string get_executable_path() {
std::vector<std::string> search_prefix;
// add the executable directory to the search path
// FIXME: this is convenient for development, but it should probably be disabled in production
#if defined(__APPLE__) #if defined(__APPLE__)
// get executable path // get executable path
std::vector<char> path; std::vector<char> path;
@ -364,7 +394,7 @@ void ggml_backend_load_all() {
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
search_prefix.push_back(base_path + "/"); return base_path + "/";
#elif defined(__linux__) #elif defined(__linux__)
std::string base_path = "."; std::string base_path = ".";
std::vector<char> path(1024); std::vector<char> path(1024);
@ -386,38 +416,117 @@ void ggml_backend_load_all() {
path.resize(path.size() * 2); path.resize(path.size() * 2);
} }
search_prefix.push_back(base_path + "/"); return base_path + "/";
#elif defined(_WIN32)
std::vector<char> path(MAX_PATH);
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
if (len == 0) {
return "";
}
std::string base_path(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
return base_path + "\\";
#endif #endif
}
auto & reg = get_reg(); static std::string backend_filename_prefix() {
auto try_load = [&](const std::string & name) {
std::string os_name;
#ifdef _WIN32 #ifdef _WIN32
os_name = "ggml-" + name + ".dll"; return "ggml-";
#else #else
os_name = "libggml-" + name + ".so"; return "libggml-";
#endif #endif
if (reg.load_backend(os_name.c_str(), true)) {
return;
} }
for (const auto & prefix : search_prefix) {
if (reg.load_backend((prefix + os_name).c_str(), true)) {
return;
}
}
};
try_load("amx"); static std::string backend_filename_suffix() {
try_load("blas"); #ifdef _WIN32
try_load("cann"); return ".dll";
try_load("cuda"); #else
try_load("hip"); return ".so";
try_load("kompute"); #endif
try_load("metal"); }
try_load("rpc");
try_load("sycl"); static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
try_load("vulkan"); //not available as we don't want c++17
try_load("musa"); printf("\nggml_backend_load_best NOT AVAILABLE!\n");
try_load("cpu"); return nullptr;
// // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// // TODO: search system paths
// std::vector<std::string> search_paths = { "./", get_executable_path() };
// std::string file_prefix = backend_filename_prefix() + name + "-";
// int best_score = 0;
// std::string best_path;
// namespace fs = std::filesystem;
// for (const auto & search_path : search_paths) {
// if (!fs::exists(search_path)) {
// continue;
// }
// for (const auto & entry : fs::directory_iterator(search_path)) {
// if (entry.is_regular_file()) {
// std::string filename = entry.path().filename().string();
// std::string ext = entry.path().extension().string();
// if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
// dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
// if (!handle && !silent) {
// GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
// }
// if (handle) {
// auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
// if (score_fn) {
// int s = score_fn();
// #ifndef NDEBUG
// GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
// #endif
// if (s > best_score) {
// best_score = s;
// best_path = entry.path().string();
// }
// } else {
// if (!silent) {
// GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
// }
// }
// }
// }
// }
// }
// }
// if (best_score == 0) {
// // try to load the base backend
// for (const auto & search_path : search_paths) {
// std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
// if (fs::exists(path)) {
// return get_reg().load_backend(path.c_str(), silent);
// }
// }
// return nullptr;
// }
// return get_reg().load_backend(best_path.c_str(), silent);
}
void ggml_backend_load_all() {
#ifdef NDEBUG
bool silent = true;
#else
bool silent = false;
#endif
ggml_backend_load_best("blas", silent);
ggml_backend_load_best("cann", silent);
ggml_backend_load_best("cuda", silent);
ggml_backend_load_best("hip", silent);
ggml_backend_load_best("kompute", silent);
ggml_backend_load_best("metal", silent);
ggml_backend_load_best("rpc", silent);
ggml_backend_load_best("sycl", silent);
ggml_backend_load_best("vulkan", silent);
ggml_backend_load_best("musa", silent);
ggml_backend_load_best("cpu", silent);
} }

View file

@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const f
int tbegin, tend; int tbegin, tend;
balance211(n, params->nth, params->ith, tbegin, tend); balance211(n, params->nth, params->ith, tbegin, tend);
f(tbegin, tend); f(tbegin, tend);
ggml_barrier(params->threadpool); // TODO: might not always be needed
} }
// quantized types that have AMX support // quantized types that have AMX support

View file

@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
__m512 vb[COLS]; __m512 vb[COLS];
__m512 vc[ROWS * COLS]; __m512 vc[ROWS * COLS];
auto loadc = [&](int idx) { auto loadc = [&](auto idx) {
vc[idx] = _mm512_setzero_ps(); vc[idx] = _mm512_setzero_ps();
}; };
Unroll<ROWS * COLS>{}(loadc); Unroll<ROWS * COLS>{}(loadc);
auto compute = [&](int idx, int k) { auto compute = [&](auto idx, auto k) {
// TODO: use `constexpr` here to get rid of interger div constexpr int row = idx / COLS;
// when upgraded to C++17 constexpr int col = idx % COLS;
const int row = idx / COLS;
const int col = idx % COLS;
if (col == 0) { if constexpr (col == 0) {
va = _mm512_loadu_ps(A + row * K + k); va = _mm512_loadu_ps(A + row * K + k);
} }
if (row == 0) { if constexpr (row == 0) {
vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); vb[col] = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k)));
} }
vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]);
@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
Unroll<ROWS * COLS>{}(compute, k); Unroll<ROWS * COLS>{}(compute, k);
} }
auto storec = [&](int idx) { auto storec = [&](auto idx) {
const int row = idx / COLS; constexpr int row = idx / COLS;
const int col = idx % COLS; constexpr int col = idx % COLS;
C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]);
}; };
Unroll<ROWS * COLS>{}(storec); Unroll<ROWS * COLS>{}(storec);
@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
const __m512i off = _mm512_set1_epi8(8); const __m512i off = _mm512_set1_epi8(8);
const __m512i lowMask = _mm512_set1_epi8(0xF); const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
// load a and compute compensation // load a and compute compensation
if (col == 0) { if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
vcomp = _mm512_setzero_si512(); vcomp = _mm512_setzero_si512();
for (int k = 0; k < 8; ++k) { for (int k = 0; k < 8; ++k) {
@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
} }
//store to C //store to C
auto storec = [&](int col) { auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
}; };
Unroll<COLS>{}(storec); Unroll<COLS>{}(storec);
@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
const __m512i lowMask = _mm512_set1_epi8(0xF); const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
// load a // load a
if (col == 0) { if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
for (int k = 0; k < 8; ++k) { for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]); va[k] = _mm512_set1_epi32(a_ptr[k]);
@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
} }
//store to C //store to C
auto storec = [&](int col) { auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
}; };
Unroll<COLS>{}(storec); Unroll<COLS>{}(storec);
@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
// //
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80)); const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
// load a and add offset 128 // load a and add offset 128
if (col == 0) { if constexpr (col == 0) {
const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs);
for (int k = 0; k < 8; ++k) { for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]); va[k] = _mm512_set1_epi32(a_ptr[k]);
@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
} }
//store to C //store to C
auto storec = [&](int col) { auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
}; };
Unroll<COLS>{}(storec); Unroll<COLS>{}(storec);
@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i lowMask = _mm512_set1_epi8(0xF); const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
// int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8 // int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8
// from {16, 8} to {4, 32} // from {16, 8} to {4, 32}
// //
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
// load a // load a
if (col == 0) { if constexpr (col == 0) {
for (int k_group = 0; k_group < QK_K / 32; ++k_group) { for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
} }
@ -1704,7 +1702,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO
} }
//store to C //store to C
auto storec = [&](int col) { auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
}; };
Unroll<COLS>{}(storec); Unroll<COLS>{}(storec);
@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i lowMask = _mm512_set1_epi8(0xF); const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
// Q5_K and Q4_K shares the same vnni formats, refer to notes above. // Q5_K and Q4_K shares the same vnni formats, refer to notes above.
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
// load a // load a
if (col == 0) { if constexpr (col == 0) {
for (int k_group = 0; k_group < QK_K / 32; ++k_group) { for (int k_group = 0; k_group < QK_K / 32; ++k_group) {
va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32)));
} }
@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO
} }
//store to C //store to C
auto storec = [&](int col) { auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
}; };
Unroll<COLS>{}(storec); Unroll<COLS>{}(storec);
@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLO
const __m512i m32s = _mm512_set1_epi32(32); const __m512i m32s = _mm512_set1_epi32(32);
const __m512i lowMask = _mm512_set1_epi8(0xF); const __m512i lowMask = _mm512_set1_epi8(0xF);
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
if (col == 0) { if constexpr (col == 0) {
// load a // load a
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80)); const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
const __m512i values256 = _mm512_add_epi8(values128, off); const __m512i values256 = _mm512_add_epi8(values128, off);
auto loadc = [&](int col) { auto loadc = [&](auto col) {
vc[col] = _mm512_setzero_ps(); vc[col] = _mm512_setzero_ps();
}; };
Unroll<COLS>{}(loadc); Unroll<COLS>{}(loadc);
auto compute = [&](int col, int i) { auto compute = [&](auto col, auto i) {
if (col == 0) { if constexpr (col == 0) {
// load a // load a
va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0)); va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 0));
va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64)); va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs + 64));
@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
} }
//store to C //store to C
auto storec = [&](int col) { auto storec = [&](auto col) {
_mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]);
}; };
Unroll<COLS>{}(storec); Unroll<COLS>{}(storec);

View file

@ -0,0 +1,323 @@
#include "ggml-backend-impl.h"
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <cstring>
#include <vector>
#include <bitset>
#include <array>
#include <string>
// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
struct cpuid_x86 {
bool SSE3(void) { return f_1_ecx[0]; }
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
bool MONITOR(void) { return f_1_ecx[3]; }
bool SSSE3(void) { return f_1_ecx[9]; }
bool FMA(void) { return f_1_ecx[12]; }
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
bool SSE41(void) { return f_1_ecx[19]; }
bool SSE42(void) { return f_1_ecx[20]; }
bool MOVBE(void) { return f_1_ecx[22]; }
bool POPCNT(void) { return f_1_ecx[23]; }
bool AES(void) { return f_1_ecx[25]; }
bool XSAVE(void) { return f_1_ecx[26]; }
bool OSXSAVE(void) { return f_1_ecx[27]; }
bool AVX(void) { return f_1_ecx[28]; }
bool F16C(void) { return f_1_ecx[29]; }
bool RDRAND(void) { return f_1_ecx[30]; }
bool MSR(void) { return f_1_edx[5]; }
bool CX8(void) { return f_1_edx[8]; }
bool SEP(void) { return f_1_edx[11]; }
bool CMOV(void) { return f_1_edx[15]; }
bool CLFSH(void) { return f_1_edx[19]; }
bool MMX(void) { return f_1_edx[23]; }
bool FXSR(void) { return f_1_edx[24]; }
bool SSE(void) { return f_1_edx[25]; }
bool SSE2(void) { return f_1_edx[26]; }
bool FSGSBASE(void) { return f_7_ebx[0]; }
bool BMI1(void) { return f_7_ebx[3]; }
bool HLE(void) { return is_intel && f_7_ebx[4]; }
bool AVX2(void) { return f_7_ebx[5]; }
bool BMI2(void) { return f_7_ebx[8]; }
bool ERMS(void) { return f_7_ebx[9]; }
bool INVPCID(void) { return f_7_ebx[10]; }
bool RTM(void) { return is_intel && f_7_ebx[11]; }
bool AVX512F(void) { return f_7_ebx[16]; }
bool AVX512DQ(void) { return f_7_ebx[17]; }
bool RDSEED(void) { return f_7_ebx[18]; }
bool ADX(void) { return f_7_ebx[19]; }
bool AVX512PF(void) { return f_7_ebx[26]; }
bool AVX512ER(void) { return f_7_ebx[27]; }
bool AVX512CD(void) { return f_7_ebx[28]; }
bool AVX512BW(void) { return f_7_ebx[30]; }
bool AVX512VL(void) { return f_7_ebx[31]; }
bool SHA(void) { return f_7_ebx[29]; }
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
bool LAHF(void) { return f_81_ecx[0]; }
bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
bool ABM(void) { return is_amd && f_81_ecx[5]; }
bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
bool XOP(void) { return is_amd && f_81_ecx[11]; }
bool TBM(void) { return is_amd && f_81_ecx[21]; }
bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
bool AVX512_FP16(void) { return f_7_edx[23]; }
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
bool AMX_TILE(void) { return f_7_edx[24]; }
bool AMX_INT8(void) { return f_7_edx[25]; }
bool AMX_FP16(void) { return f_7_1_eax[21]; }
bool AMX_BF16(void) { return f_7_edx[22]; }
#ifdef _MSC_VER
static void cpuid(int cpu_info[4], int eax) {
__cpuid(cpu_info, eax);
}
static void cpuidex(int cpu_info[4], int eax, int ecx) {
__cpuidex(cpu_info, eax, ecx);
}
#else
static void cpuid(int cpu_info[4], int eax) {
__asm__ __volatile__(
"cpuid"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(0));
}
static void cpuidex(int cpu_info[4], int eax, int ecx) {
__asm__ __volatile__(
"cpuid"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(ecx));
}
#endif
cpuid_x86() {
std::array<int, 4> cpui;
std::vector<std::array<int, 4>> data;
// calling __cpuid with 0x0 as the function_id argument
// gets the number of the highest valid function ID.
cpuid(cpui.data(), 0);
int n_ids = cpui[0];
for (int i = 0; i <= n_ids; ++i) {
cpuidex(cpui.data(), i, 0);
data.push_back(cpui);
}
// capture vendor string
char vendor[0x20] = {};
*reinterpret_cast<int *>(vendor) = data[0][1];
*reinterpret_cast<int *>(vendor + 4) = data[0][3];
*reinterpret_cast<int *>(vendor + 8) = data[0][2];
this->vendor = vendor;
if (this->vendor == "GenuineIntel") {
is_intel = true;
} else if (this->vendor == "AuthenticAMD") {
is_amd = true;
}
// load bitset with flags for function 0x00000001
if (n_ids >= 1) {
f_1_ecx = data[1][2];
f_1_edx = data[1][3];
}
// load bitset with flags for function 0x00000007
if (n_ids >= 7) {
f_7_ebx = data[7][1];
f_7_ecx = data[7][2];
f_7_edx = data[7][3];
cpuidex(cpui.data(), 7, 1);
f_7_1_eax = cpui[0];
}
// calling __cpuid with 0x80000000 as the function_id argument
// gets the number of the highest valid extended ID.
cpuid(cpui.data(), 0x80000000);
unsigned int n_ex_ids = cpui[0];
std::vector<std::array<int, 4>> ext_data;
for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
cpuidex(cpui.data(), i, 0);
ext_data.push_back(cpui);
}
// load bitset with flags for function 0x80000001
if (n_ex_ids >= 0x80000001) {
f_81_ecx = ext_data[1][2];
f_81_edx = ext_data[1][3];
}
// interpret CPU brand string if reported
char brand[0x40] = {};
if (n_ex_ids >= 0x80000004) {
std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
this->brand = brand;
}
}
bool is_intel = false;
bool is_amd = false;
std::string vendor;
std::string brand;
std::bitset<32> f_1_ecx;
std::bitset<32> f_1_edx;
std::bitset<32> f_7_ebx;
std::bitset<32> f_7_ecx;
std::bitset<32> f_7_edx;
std::bitset<32> f_7_1_eax;
std::bitset<32> f_81_ecx;
std::bitset<32> f_81_edx;
};
#if 0
void test_x86_is() {
cpuid_x86 is;
printf("CPU Vendor: %s\n", is.vendor.c_str());
printf("Brand: %s\n", is.brand.c_str());
printf("is_intel: %d\n", is.is_intel);
printf("is_amd: %d\n", is.is_amd);
printf("sse3: %d\n", is.SSE3());
printf("pclmulqdq: %d\n", is.PCLMULQDQ());
printf("ssse3: %d\n", is.SSSE3());
printf("fma: %d\n", is.FMA());
printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
printf("sse41: %d\n", is.SSE41());
printf("sse42: %d\n", is.SSE42());
printf("movbe: %d\n", is.MOVBE());
printf("popcnt: %d\n", is.POPCNT());
printf("aes: %d\n", is.AES());
printf("xsave: %d\n", is.XSAVE());
printf("osxsave: %d\n", is.OSXSAVE());
printf("avx: %d\n", is.AVX());
printf("f16c: %d\n", is.F16C());
printf("rdrand: %d\n", is.RDRAND());
printf("msr: %d\n", is.MSR());
printf("cx8: %d\n", is.CX8());
printf("sep: %d\n", is.SEP());
printf("cmov: %d\n", is.CMOV());
printf("clflush: %d\n", is.CLFSH());
printf("mmx: %d\n", is.MMX());
printf("fxsr: %d\n", is.FXSR());
printf("sse: %d\n", is.SSE());
printf("sse2: %d\n", is.SSE2());
printf("fsgsbase: %d\n", is.FSGSBASE());
printf("bmi1: %d\n", is.BMI1());
printf("hle: %d\n", is.HLE());
printf("avx2: %d\n", is.AVX2());
printf("bmi2: %d\n", is.BMI2());
printf("erms: %d\n", is.ERMS());
printf("invpcid: %d\n", is.INVPCID());
printf("rtm: %d\n", is.RTM());
printf("avx512f: %d\n", is.AVX512F());
printf("rdseed: %d\n", is.RDSEED());
printf("adx: %d\n", is.ADX());
printf("avx512pf: %d\n", is.AVX512PF());
printf("avx512er: %d\n", is.AVX512ER());
printf("avx512cd: %d\n", is.AVX512CD());
printf("sha: %d\n", is.SHA());
printf("prefetchwt1: %d\n", is.PREFETCHWT1());
printf("lahf: %d\n", is.LAHF());
printf("lzcnt: %d\n", is.LZCNT());
printf("abm: %d\n", is.ABM());
printf("sse4a: %d\n", is.SSE4a());
printf("xop: %d\n", is.XOP());
printf("tbm: %d\n", is.TBM());
printf("syscall: %d\n", is.SYSCALL());
printf("mmxext: %d\n", is.MMXEXT());
printf("rdtscp: %d\n", is.RDTSCP());
printf("3dnowext: %d\n", is._3DNOWEXT());
printf("3dnow: %d\n", is._3DNOW());
printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
printf("avx512_vnni: %d\n", is.AVX512_VNNI());
printf("avx512_fp16: %d\n", is.AVX512_FP16());
printf("avx512_bf16: %d\n", is.AVX512_BF16());
printf("amx_tile: %d\n", is.AMX_TILE());
printf("amx_int8: %d\n", is.AMX_INT8());
printf("amx_fp16: %d\n", is.AMX_FP16());
printf("amx_bf16: %d\n", is.AMX_BF16());
}
#endif
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
int score = 0;
cpuid_x86 is;
#ifdef GGML_FMA
if (!is.FMA()) { return 0; }
score += 1;
#endif
#ifdef GGML_F16C
if (!is.F16C()) { return 0; }
score += 1<<1;
#endif
#ifdef GGML_SSE42
if (!is.SSE42()) { return 0; }
score += 1<<2;
#endif
#ifdef GGML_AVX
if (!is.AVX()) { return 0; }
score += 1<<4;
#endif
#ifdef GGML_AVX2
if (!is.AVX2()) { return 0; }
score += 1<<5;
#endif
#ifdef GGML_AVX_VNNI
if (!is.AVX_VNNI()) { return 0; }
score += 1<<6;
#endif
#ifdef GGML_AVX512
if (!is.AVX512F()) { return 0; }
if (!is.AVX512CD()) { return 0; }
if (!is.AVX512VL()) { return 0; }
if (!is.AVX512DQ()) { return 0; }
if (!is.AVX512BW()) { return 0; }
score += 1<<7;
#endif
#ifdef GGML_AVX512_VBMI
if (!is.AVX512_VBMI()) { return 0; }
score += 1<<8;
#endif
#ifdef GGML_AVX512_BF16
if (!is.AVX512_BF16()) { return 0; }
score += 1<<9;
#endif
#ifdef GGML_AVX512_VNNI
if (!is.AVX512_VNNI()) { return 0; }
score += 1<<10;
#endif
#ifdef GGML_AMX_INT8
if (!is.AMX_INT8()) { return 0; }
score += 1<<11;
#endif
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))

View file

@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
} }
static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) #if defined(__AVX512VNNI__)
const __m512i zero = _mm512_setzero_si512(); const __m512i zero = _mm512_setzero_si512();
return _mm512_dpbusd_epi32(zero, ax, sy); return _mm512_dpbusd_epi32(zero, ax, sy);
#else #else
@ -525,67 +525,47 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved); UNUSED(ncols_interleaved);
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx; const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
const void * a_ptr = vy;
float * res_ptr = s;
__asm__ __volatile__( for (int c = 0; c < nc; c += ncols_interleaved) {
"movi v31.16b, #0x4\n" const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
"movi v30.16b, #0xf0\n" float32x4_t acc = vdupq_n_f32(0);
"add %x[b_ptr], %x[b_ptr], #0x8\n" for (int b = 0; b < nb; b++) {
"1:" // Column loop int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
"add x22, %x[a_ptr], #0x2\n" int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
"movi v29.16b, #0x0\n" int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
"mov x21, %x[nb]\n" int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
"2:" // Block loop float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
"ldr q28, [%x[b_ptr], #0x0]\n"
"ldr q27, [x22, #0x0]\n" int8x16_t a0 = vld1q_s8(a_ptr->qs);
"movi v26.4s, #0x0\n" int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
"sub x20, x22, #0x2\n" float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
"ldr q25, [x22, #0x10]\n"
"ldr q24, [%x[b_ptr], #0x10]\n" int32x4_t ret = vdupq_n_s32(0);
"sub x21, x21, #0x1\n"
"add x22, x22, #0x22\n" ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
"ldr q23, [%x[b_ptr], #0x20]\n" ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
"ldr q22, [%x[b_ptr], #0x30]\n" ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
"ld1r { v21.8h }, [x20]\n" ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
"ldr q20, [%x[b_ptr], #-0x8]\n"
"sshl v16.16b, v28.16b, v31.16b\n" ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
"and v28.16b, v28.16b, v30.16b\n" ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
"sshl v19.16b, v24.16b, v31.16b\n" ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
"and v24.16b, v24.16b, v30.16b\n" ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
"add %x[b_ptr], %x[b_ptr], #0x48\n"
"sshl v18.16b, v23.16b, v31.16b\n" acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
"and v23.16b, v23.16b, v30.16b\n" vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n" a_ptr++;
"sshl v17.16b, v22.16b, v31.16b\n" b_ptr++;
"and v22.16b, v22.16b, v30.16b\n" }
"fcvtl v21.4s, v21.4h\n" vst1q_f32(s, acc);
"fcvtl v16.4s, v20.4h\n" s += ncols_interleaved;
".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n" }
"fmul v16.4s, v16.4s, v21.4s\n"
".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
"scvtf v26.4s, v26.4s, #0x4\n"
"fmla v29.4s, v26.4s, v16.4s\n"
"cbnz x21, 2b\n"
"sub %x[nc], %x[nc], #0x4\n"
"str q29, [%x[res_ptr], #0x0]\n"
"add %x[res_ptr], %x[res_ptr], #0x10\n"
"cbnz %x[nc], 1b\n"
: [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
: [a_ptr] "r" (a_ptr), [nb] "r" (nb)
: "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
);
return; return;
} }
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
float sumf[4]; float sumf[4];
int sumi; int sumi;

View file

@ -759,7 +759,7 @@ do { \
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else #else
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8]; float tmp[8];
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
@ -1377,7 +1377,10 @@ struct ggml_compute_state {
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
@ -2429,7 +2432,7 @@ bool ggml_is_numa(void) {
#endif #endif
#if !defined(HWCAP2_I8MM) #if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM 0 #define HWCAP2_I8MM (1 << 13)
#endif #endif
static void ggml_init_arm_arch_features(void) { static void ggml_init_arm_arch_features(void) {
@ -8284,6 +8287,77 @@ static void ggml_compute_forward_set_f32(
} }
} }
static void ggml_compute_forward_set_i32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// view src0 and dst with these strides and data offset inbytes during set
// nb0 is implicitly element_size because src0 and dst are contiguous
size_t nb1 = ((int32_t *) dst->op_params)[0];
size_t nb2 = ((int32_t *) dst->op_params)[1];
size_t nb3 = ((int32_t *) dst->op_params)[2];
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
if (params->ith == 0) {
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
ggml_barrier(params->threadpool);
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src1);
const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during set
const size_t nb0 = ggml_element_size(src0);
const int im0 = (ne10 == 0 ? 0 : ne10-1);
const int im1 = (ne11 == 0 ? 0 : ne11-1);
const int im2 = (ne12 == 0 ? 0 : ne12-1);
const int im3 = (ne13 == 0 ? 0 : ne13-1);
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
GGML_ASSERT(nb10 == sizeof(int32_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are viewed with shape of src1 and offset
// => same indices
const int i3 = ir/(ne12*ne11);
const int i2 = (ir - i3*ne12*ne11)/ne11;
const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
ggml_vec_cpy_i32(nc,
(int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
(int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
}
}
static void ggml_compute_forward_set( static void ggml_compute_forward_set(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -8295,6 +8369,10 @@ static void ggml_compute_forward_set(
{ {
ggml_compute_forward_set_f32(params, dst); ggml_compute_forward_set_f32(params, dst);
} break; } break;
case GGML_TYPE_I32:
{
ggml_compute_forward_set_i32(params, dst);
} break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
case GGML_TYPE_BF16: case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
@ -10475,6 +10553,40 @@ static void ggml_compute_forward_pad(
} }
} }
// ggml_compute_forward_pad_reflect_1d
static void ggml_compute_forward_pad_reflect_1d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int ith = params->ith;
const int nth = params->nth;
const int32_t * opts = (const int32_t *) dst->op_params;
const int p0 = opts[0];
const int p1 = opts[1];
GGML_TENSOR_UNARY_OP_LOCALS
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0);
float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; }
for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
}
}
}
}
// ggml_compute_forward_arange // ggml_compute_forward_arange
@ -12571,6 +12683,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_pad(params, tensor); ggml_compute_forward_pad(params, tensor);
} break; } break;
case GGML_OP_PAD_REFLECT_1D:
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
{ {
ggml_compute_forward_arange(params, tensor); ggml_compute_forward_arange(params, tensor);
@ -12913,6 +13029,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break; } break;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:

View file

@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_llamafile()) { if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" }); features.push_back({ "LLAMAFILE", "1" });
} }
// TODO: rename this #ifdef GGML_USE_ACCELERATE
features.push_back({ "ACCELERATE", "1" });
#endif
#ifdef GGML_USE_CPU_HBM
features.push_back({ "CPU_HBM", "1" });
#endif
#ifdef GGML_USE_OPENMP
features.push_back({ "OPENMP", "1" });
#endif
#ifdef GGML_USE_CPU_AARCH64 #ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" }); features.push_back({ "AARCH64_REPACK", "1" });
#endif #endif

View file

@ -220,7 +220,6 @@ static __global__ void flash_attn_vec_ext_f16(
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j]; half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j; kqmax_shared[j][threadIdx.y] = kqmax_new_j;
} }

View file

@ -206,7 +206,6 @@ static __global__ void flash_attn_vec_ext_f32(
for (int j = 0; j < ncols; ++j) { for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j]; float kqmax_new_j = kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j; kqmax_shared[j][threadIdx.y] = kqmax_new_j;
} }

View file

@ -310,14 +310,14 @@ void ggml_aligned_free(void * ptr, size_t size);
// FP16 to FP32 conversion // FP16 to FP32 conversion
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
#ifdef _MSC_VER #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
typedef uint16_t ggml_fp16_internal_t; typedef uint16_t ggml_fp16_internal_t;
#else #else
typedef __fp16 ggml_fp16_internal_t; typedef __fp16 ggml_fp16_internal_t;
#endif #endif
#endif #endif
#if defined(__ARM_NEON) && !defined(_MSC_VER) #if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

View file

@ -102,6 +102,21 @@ typedef struct {
uint64_t nb3; uint64_t nb3;
} ggml_metal_kargs_cpy; } ggml_metal_kargs_cpy;
typedef struct {
int64_t ne10;
int64_t ne11;
int64_t ne12;
uint64_t nb10;
uint64_t nb11;
uint64_t nb12;
uint64_t nb13;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
uint64_t offs;
bool inplace;
} ggml_metal_kargs_set;
typedef struct { typedef struct {
int32_t ne00; int32_t ne00;
int32_t ne01; int32_t ne01;
@ -192,6 +207,30 @@ typedef struct {
int16_t r3; int16_t r3;
} ggml_metal_kargs_mul_mv; } ggml_metal_kargs_mul_mv;
typedef struct {
int32_t ne00;
int32_t ne01;
int32_t ne02;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int32_t ne10;
int32_t ne11;
int32_t ne12;
uint64_t nb10;
uint64_t nb11;
uint64_t nb12;
uint64_t nb13;
int32_t ne0;
int32_t ne1;
int16_t r2;
int16_t r3;
int16_t nsg;
int16_t nxpsg;
int16_t r1ptg;
} ggml_metal_kargs_mul_mv_ext;
typedef struct { typedef struct {
int32_t nei0; int32_t nei0;
int32_t nei1; int32_t nei1;

View file

@ -175,6 +175,46 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,
@ -266,8 +306,11 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_IM2COL_F32,
GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16, GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32, GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32,
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32,
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@ -329,6 +372,8 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
GGML_METAL_KERNEL_TYPE_SET_I32,
GGML_METAL_KERNEL_TYPE_SET_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,
@ -350,6 +395,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
GGML_METAL_KERNEL_TYPE_ARGMAX,
GGML_METAL_KERNEL_TYPE_COUNT GGML_METAL_KERNEL_TYPE_COUNT
}; };
@ -464,6 +510,35 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
#endif #endif
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (path_lib == nil) {
// Try to find the resource in the directory where the current binary located.
NSString * current_binary = [[NSProcessInfo processInfo] arguments][0];
NSString * bin_dir = [current_binary stringByDeletingLastPathComponent];
NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]);
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error];
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
// Optionally, if this is a symlink, try to resolve it.
default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error];
if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) {
// It is a relative path, adding the binary directory as directory prefix.
default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]];
}
if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) {
// Link to the resource could not be resolved.
default_metallib_path = nil;
} else {
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]);
}
}
} else {
// The resource couldn't be found in the binary's directory.
default_metallib_path = nil;
}
path_lib = default_metallib_path;
}
if (try_metallib && path_lib != nil) { if (try_metallib && path_lib != nil) {
// pre-compiled library found // pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:path_lib]; NSURL * libURL = [NSURL fileURLWithPath:path_lib];
@ -699,6 +774,46 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2, mul_mv_ext_f16_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3, mul_mv_ext_f16_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4, mul_mv_ext_f16_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5, mul_mv_ext_f16_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2, mul_mv_ext_q4_0_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3, mul_mv_ext_q4_0_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4, mul_mv_ext_q4_0_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5, mul_mv_ext_q4_0_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2, mul_mv_ext_q4_1_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3, mul_mv_ext_q4_1_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4, mul_mv_ext_q4_1_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5, mul_mv_ext_q4_1_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2, mul_mv_ext_q5_0_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3, mul_mv_ext_q5_0_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4, mul_mv_ext_q5_0_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5, mul_mv_ext_q5_0_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2, mul_mv_ext_q5_1_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3, mul_mv_ext_q5_1_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4, mul_mv_ext_q5_1_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5, mul_mv_ext_q5_1_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2, mul_mv_ext_q8_0_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3, mul_mv_ext_q8_0_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4, mul_mv_ext_q8_0_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5, mul_mv_ext_q8_0_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2, mul_mv_ext_q4_K_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3, mul_mv_ext_q4_K_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4, mul_mv_ext_q4_K_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5, mul_mv_ext_q4_K_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2, mul_mv_ext_q5_K_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3, mul_mv_ext_q5_K_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4, mul_mv_ext_q5_K_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5, mul_mv_ext_q5_K_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2, mul_mv_ext_q6_K_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3, mul_mv_ext_q6_K_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4, mul_mv_ext_q6_K_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5, mul_mv_ext_q6_K_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2, mul_mv_ext_iq4_nl_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3, mul_mv_ext_iq4_nl_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4, mul_mv_ext_iq4_nl_f32_r1_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5, mul_mv_ext_iq4_nl_f32_r1_5, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, has_simdgroup_reduction);
@ -790,8 +905,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16, im2col_ext_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16, im2col_ext_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32, im2col_ext_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F32, im2col_ext_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32, conv_transpose_1d_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@ -853,6 +971,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat);
@ -872,6 +992,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
} }
@ -989,6 +1110,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_CONV_TRANSPOSE_1D:
return true; return true;
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SQRT: case GGML_OP_SQRT:
@ -1001,6 +1123,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return has_simdgroup_reduction; return has_simdgroup_reduction;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
return has_simdgroup_reduction && (op->ne[0] % 4 == 0); return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
case GGML_OP_ARGMAX:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ROPE: case GGML_OP_ROPE:
return true; return true;
@ -1011,6 +1134,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
@ -1068,6 +1192,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return false; return false;
}; };
} }
case GGML_OP_SET:
{
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_I32:
return true;
default:
return false;
};
}
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
@ -1928,30 +2062,180 @@ static void ggml_metal_encode_node(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = 4; const int ne11_mm_min = 4;
#if 0 // first try to use small-batch mat-mv kernels
// the numbers below are measured on M2 Ultra for 7B and 13B models // these should be efficient for BS [2, ~8]
// these numbers do not translate to other devices or model sizes if (src1t == GGML_TYPE_F32 && (ne00%256 == 0) &&
// TODO: need to find a better approach (
if ([device.name isEqualToString:@"Apple M2 Ultra"]) { (
switch (src0t) { (
case GGML_TYPE_F16: ne11_mm_min = 2; break; src0t == GGML_TYPE_F16 || // TODO: helper function
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; src0t == GGML_TYPE_Q4_0 ||
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; src0t == GGML_TYPE_Q4_1 ||
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; src0t == GGML_TYPE_Q5_0 ||
src0t == GGML_TYPE_Q5_1 ||
src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_IQ4_NL ||
false) && (ne11 >= 2 && ne11 <= 8)
) ||
(
(
src0t == GGML_TYPE_Q4_K ||
src0t == GGML_TYPE_Q5_K ||
src0t == GGML_TYPE_Q6_K ||
false) && (ne11 >= 4 && ne11 <= 8)
)
)
) {
// TODO: determine the optimal parameters based on grid utilization
// I still don't know why we should not always use the maximum available threads:
//
// nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
//
// my current hypothesis is that the work grid is not evenly divisible for different nsg
// values and there can be some tail effects when nsg is high. need to confirm this
//
const int nsg = 2; // num simdgroups per threadgroup
const int nxpsg = ne11 < 3 ? 16 : 8; // num threads along row per simdgroup
const int nypsg = 32/nxpsg; // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
const int r0ptg = nypsg*nsg; // num src0 rows per threadgroup
int r1ptg = 4; // num src1 rows per threadgroup
// note: not sure how optimal are those across all different hardware. there might be someting cleverer
switch (ne11) {
case 2:
r1ptg = 2; break;
case 3:
case 6:
r1ptg = 3; break;
case 4:
case 7:
case 8:
r1ptg = 4; break;
case 5:
r1ptg = 5; break;
};
id<MTLComputePipelineState> pipeline = nil;
switch (src0->type) {
case GGML_TYPE_F16:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; switch (r1ptg) {
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_2].pipeline; break;
case GGML_TYPE_Q5_0: // not tested yet case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_3].pipeline; break;
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_4].pipeline; break;
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_0_F32_R1_5].pipeline; break;
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; default: GGML_ABORT("not implemented");
default: ne11_mm_min = 1; break; } break;
case GGML_TYPE_Q4_1:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_1_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q5_0:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_0_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q5_1:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_1_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q8_0:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q8_0_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q4_K:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q4_K_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q5_K:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q5_K_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_Q6_K:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_Q6_K_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
case GGML_TYPE_IQ4_NL:
switch (r1ptg) {
case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_2].pipeline; break;
case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_3].pipeline; break;
case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_4].pipeline; break;
case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_IQ4_NL_F32_R1_5].pipeline; break;
default: GGML_ABORT("not implemented");
} break;
default: GGML_ABORT("not implemented");
} }
}
#endif
ggml_metal_kargs_mul_mv_ext args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne10 =*/ ne10,
/*.ne11 =*/ ne11,
/*.ne12 =*/ ne12,
/*.nb10 =*/ nb10,
/*.nb11 =*/ nb11,
/*.nb12 =*/ nb12,
/*.nb13 =*/ nb13,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.r2 =*/ r2,
/*.r3 =*/ r3,
/*.nsg =*/ nsg,
/*.nxpsg =*/ nxpsg,
/*.r1ptg =*/ r1ptg,
};
[encoder setComputePipelineState:pipeline];
[encoder setBytes:&args length:sizeof(args) atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
//printf("ne01 = %lld nr0ptg = %d\n", ne01, nr0ptg);
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + r0ptg - 1)/r0ptg, (ne11 + r1ptg - 1)/r1ptg, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
} else
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if ([device supportsFamily:MTLGPUFamilyApple7] && if ([device supportsFamily:MTLGPUFamilyApple7] &&
@ -2908,6 +3192,49 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
} }
} break; } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t IC = src1->ne[1];
const int32_t IL = src1->ne[0];
const int32_t K = src0->ne[0];
const int32_t OL = dst->ne[0];
const int32_t OC = dst->ne[1];
id<MTLComputePipelineState> pipeline;
switch (src0->type) {
case GGML_TYPE_F32: {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F32_F32].pipeline;
} break;
case GGML_TYPE_F16: {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32].pipeline;
} break;
default: GGML_ABORT("fatal error");
};
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&IC length:sizeof( int32_t) atIndex:3];
[encoder setBytes:&IL length:sizeof( int32_t) atIndex:4];
[encoder setBytes:&K length:sizeof( int32_t) atIndex:5];
[encoder setBytes:&s0 length:sizeof( int32_t) atIndex:6];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:8];
[encoder dispatchThreadgroups:MTLSizeMake(OL, OC, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
{ {
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
@ -2977,6 +3304,38 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0); const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_PAD_REFLECT_1D:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int32_t p0 = ((const int32_t *)(dst->op_params))[0];
const int32_t p1 = ((const int32_t *)(dst->op_params))[1];
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14];
[encoder setBytes:&p0 length:sizeof(p0) atIndex:15];
[encoder setBytes:&p1 length:sizeof(p1) atIndex:16];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ARANGE: case GGML_OP_ARANGE:
@ -3508,6 +3867,68 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_SET:
{
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// src0 and dst as viewed during set
const size_t dst_nb0 = ggml_element_size(src0);
const size_t dst_nb1 = ((int32_t *) dst->op_params)[0];
const size_t dst_nb2 = ((int32_t *) dst->op_params)[1];
const size_t dst_nb3 = ((int32_t *) dst->op_params)[2];
const size_t offset = ((int32_t *) dst->op_params)[3];
const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst));
}
const int im0 = (ne10 == 0 ? 0 : ne10-1);
const int im1 = (ne11 == 0 ? 0 : ne11-1);
const int im2 = (ne12 == 0 ? 0 : ne12-1);
const int im3 = (ne13 == 0 ? 0 : ne13-1);
GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst));
id<MTLComputePipelineState> pipeline = nil;
switch (src0t) {
case GGML_TYPE_F32:
GGML_ASSERT(nb10 == sizeof(float));
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break;
case GGML_TYPE_I32:
GGML_ASSERT(nb10 == sizeof(int32_t));
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break;
default: GGML_ABORT("fatal error");
}
ggml_metal_kargs_set args = {
/*.ne10 =*/ ne10,
/*.ne11 =*/ ne11,
/*.ne12 =*/ ne12,
/*.nb10 =*/ nb10,
/*.nb11 =*/ nb11,
/*.nb12 =*/ nb12,
/*.nb13 =*/ nb13,
/*.nb1 =*/ dst_nb1,
/*.nb2 =*/ dst_nb2,
/*.nb3 =*/ dst_nb3,
/*.offs =*/ offset,
/*.inplace =*/ inplace,
};
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10);
[encoder setComputePipelineState:pipeline];
[encoder setBytes:&args length:sizeof(args) atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
{ {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0));
@ -3567,6 +3988,31 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
} break; } break;
case GGML_OP_ARGMAX:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
const int64_t nrows = ggml_nrows(src0);
int nth = 32; // SIMD width
while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
nth *= 2;
}
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGMAX].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
[encoder setThreadgroupMemoryLength:32*sizeof(int32_t) atIndex:1];
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
default: default:
{ {
GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));

View file

@ -47,6 +47,11 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
reg = (type4x4)(*src); reg = (type4x4)(*src);
} }
template <typename type4>
void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) {
reg = (type4)(*(src + il));
}
#if defined(GGML_METAL_USE_BF16) #if defined(GGML_METAL_USE_BF16)
template <typename type4x4> template <typename type4x4>
void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) { void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
@ -73,6 +78,21 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
reg = (type4x4) reg_f; reg = (type4x4) reg_f;
} }
template <typename type4>
void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
const float d2 = d1 / 256.f;
const float md = -8.h * xb->d;
const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
const ushort mask1 = mask0 << 8;
for (int i = 0; i < 2; i++) {
reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + md;
reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + md;
}
}
template <typename type4x4> template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) { void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 2); device const uint16_t * qs = ((device const uint16_t *)xb + 2);
@ -92,6 +112,21 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
reg = (type4x4) reg_f; reg = (type4x4) reg_f;
} }
template <typename type4>
void dequantize_q4_1_t4(device const block_q4_1 * xb, short il, thread type4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
const float d2 = d1 / 256.f;
const float m = xb->m;
const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
const ushort mask1 = mask0 << 8;
for (int i = 0; i < 2; i++) {
reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + m;
reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + m;
}
}
template <typename type4x4> template <typename type4x4>
void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) { void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 3); device const uint16_t * qs = ((device const uint16_t *)xb + 3);
@ -124,6 +159,36 @@ void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg
reg = (type4x4) reg_f; reg = (type4x4) reg_f;
} }
template <typename type4>
void dequantize_q5_0_t4(device const block_q5_0 * xb, short il, thread type4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
const float d = xb->d;
const float md = -16.h * xb->d;
const ushort mask = (il/4) ? 0x00F0 : 0x000F;
const uint32_t qh = *((device const uint32_t *)xb->qh);
const int x_mv = (il/4) ? 4 : 0;
const int gh_mv = (il/4) ? 12 : 0;
const int gh_bk = (il/4) ? 0 : 4;
for (int ii = 0; ii < 2; ii++) {
int i = 2*(il%4) + ii;
// extract the 5-th bits for x0 and x1
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
// combine the 4-bits from qs with the 5th bit
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
reg[2*ii + 0] = d * x0 + md;
reg[2*ii + 1] = d * x1 + md;
}
}
template <typename type4x4> template <typename type4x4>
void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) { void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 4); device const uint16_t * qs = ((device const uint16_t *)xb + 4);
@ -156,10 +221,40 @@ void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg
reg = (type4x4) reg_f; reg = (type4x4) reg_f;
} }
template <typename type4>
void dequantize_q5_1_t4(device const block_q5_1 * xb, short il, thread type4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 4);
const float d = xb->d;
const float m = xb->m;
const ushort mask = (il/4) ? 0x00F0 : 0x000F;
const uint32_t qh = *((device const uint32_t *)xb->qh);
const int x_mv = (il/4) ? 4 : 0;
const int gh_mv = (il/4) ? 12 : 0;
const int gh_bk = (il/4) ? 0 : 4;
for (int ii = 0; ii < 2; ii++) {
int i = 2*(il%4) + ii;
// extract the 5-th bits for x0 and x1
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
// combine the 4-bits from qs with the 5th bit
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
reg[2*ii + 0] = d * x0 + m;
reg[2*ii + 1] = d * x1 + m;
}
}
template <typename type4x4> template <typename type4x4>
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
device const int8_t * qs = ((device const int8_t *)xb->qs); device const int8_t * qs = ((device const int8_t *)xb->qs);
const half d = xb->d; const float d = xb->d;
float4x4 reg_f; float4x4 reg_f;
@ -170,6 +265,16 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg
reg = (type4x4) reg_f; reg = (type4x4) reg_f;
} }
template <typename type4>
void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & reg) {
device const int8_t * qs = ((device const int8_t *)xb->qs);
const float d = xb->d;
for (int i = 0; i < 4; i++) {
reg[i] = (qs[4*(il%4) + i + 16*(il/4)] * d);
}
}
template <typename type4x4> template <typename type4x4>
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
const float d = xb->d; const float d = xb->d;
@ -469,6 +574,19 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
} }
} }
template <typename type4>
void dequantize_iq4_nl_t4(device const block_iq4_nl * xb, short il, thread type4 & reg) {
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
const float d = xb->d;
uint32_t aux32;
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
aux32 = ((q4[2*(il%4)] | (q4[2*(il%4)+1] << 16)) >> 4*(il/4)) & 0x0f0f0f0f;
reg[0] = d * kvalues_iq4nl_f[q8[0]];
reg[1] = d * kvalues_iq4nl_f[q8[1]];
reg[2] = d * kvalues_iq4nl_f[q8[2]];
reg[3] = d * kvalues_iq4nl_f[q8[3]];
}
template <typename type4x4> template <typename type4x4>
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) { void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2 // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
@ -1248,6 +1366,63 @@ kernel void kernel_ssm_scan_f32(
} }
} }
kernel void kernel_argmax(
device const void * x,
device int32_t * dst,
constant int64_t & ncols,
constant uint64_t & nb01,
threadgroup float * shared_maxval [[threadgroup(0)]],
threadgroup int32_t * shared_argmax [[threadgroup(1)]],
uint tgpig[[threadgroup_position_in_grid]],
uint tpitg[[thread_position_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint ntg[[threads_per_threadgroup]]) {
device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01);
float lmax = -INFINITY;
int32_t larg = -1;
for (int i00 = tpitg; i00 < ncols; i00 += ntg) {
if (x_row[i00] > lmax) {
lmax = x_row[i00];
larg = i00;
}
}
// find the argmax value in the block
float max_val = simd_max(lmax);
int32_t arg_val = simd_max(select(-1, larg, lmax == max_val));
if (ntg > N_SIMDWIDTH) {
if (sgitg == 0) {
shared_maxval[tiisg] = -INFINITY;
shared_argmax[tiisg] = -1;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (tiisg == 0) {
shared_maxval[sgitg] = max_val;
shared_argmax[sgitg] = arg_val;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
max_val = shared_maxval[tiisg];
arg_val = shared_argmax[tiisg];
float max_val_reduced = simd_max(max_val);
int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced));
dst[tgpig] = arg_val_reduced;
return;
}
dst[tgpig] = arg_val;
}
kernel void kernel_norm( kernel void kernel_norm(
constant ggml_metal_kargs_norm & args, constant ggml_metal_kargs_norm & args,
device const char * src0, device const char * src0,
@ -1752,6 +1927,301 @@ kernel void kernel_mul_mv_q8_0_f32(
kernel_mul_mv_q8_0_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); kernel_mul_mv_q8_0_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
} }
// mat-vec kernel processing in chunks of float4
// chpb - chunks per quantization block
template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4)(device const q_t *, short, thread float4 &) >
void kernel_mul_mv_ext_q4_f32_impl(
constant ggml_metal_kargs_mul_mv_ext & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
const short chpt = 4; // chunks per thread
//const short nxpsg = (32);
const short nypsg = (32/nxpsg);
const short tx = tiisg%nxpsg;
const short ty = tiisg/nxpsg;
const int i01 = tgpig.x*(nypsg*args.nsg) + nypsg*sgitg + ty;
const int i11 = tgpig.y*r1ptg;
const int i1m = tgpig.z;
const int i12 = i1m%args.ne12;
const int i13 = i1m/args.ne12;
const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
device const float4 * y4[r1ptg];
for (int ir1 = 0; ir1 < r1ptg; ++ir1) {
y4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4 *) src1;
}
float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
short cch = tx%chpb; // current chunk index
for (int ich = tx; 4*ich < args.ne00; ich += chpt*nxpsg) {
float4 lx[chpt];
#pragma unroll(chpt)
for (short ch = 0; ch < chpt; ++ch) {
deq_t4(xq, cch, lx[ch]);
cch += nxpsg;
if (cch >= chpb) {
xq += cch/chpb;
cch %= chpb;
}
}
#pragma unroll(chpt)
for (short ch = 0; ch < chpt; ++ch) {
#pragma unroll(r1ptg)
for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
sumf[ir1] += dot(lx[ch], y4[ir1][ch*nxpsg]);
}
}
#pragma unroll(r1ptg)
for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
y4[ir1] += chpt*nxpsg;
}
}
// reduce only the threads in each row
for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
if (nxpsg >= 32) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
}
if (nxpsg >= 16) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 8);
}
if (nxpsg >= 8) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 4);
}
if (nxpsg >= 4) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 2);
}
if (nxpsg >= 2) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 1);
}
//sumf[ir1] = simd_sum(sumf[ir1]);
}
if (tx == 0) {
for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) {
device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0;
if (i01 < args.ne01) {
dst_f32[i01] = sumf[ir1];
}
}
}
}
// mat-vec kernel processing in chunks of float4x4
template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &) >
void kernel_mul_mv_ext_q4x4_f32_impl(
constant ggml_metal_kargs_mul_mv_ext & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
const short chpt = 1;
//const short nxpsg = (32);
const short nypsg = (32/nxpsg);
const short tx = tiisg%nxpsg;
const short ty = tiisg/nxpsg;
const int i01 = tgpig.x*(nypsg*args.nsg) + nypsg*sgitg + ty;
const int i11 = tgpig.y*r1ptg;
const int i1m = tgpig.z;
const int i12 = i1m%args.ne12;
const int i13 = i1m/args.ne12;
const uint64_t offset0 = i01*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
const uint64_t offset1 = i11*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13;
device const q_t * xq = (i01 < args.ne01) ? (device const q_t *) (src0 + offset0) + tx/chpb : (device const q_t *) src0;
device const float4x4 * y4x4[r1ptg];
for (int ir1 = 0; ir1 < r1ptg; ++ir1) {
y4x4[ir1] = (i11 + ir1 < args.ne11) ? (device const float4x4 *) (src1 + offset1 + ir1*args.nb11) + tx : (device const float4x4 *) src1;
}
float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
short cch = tx%chpb;
for (int ich = tx; 16*ich < args.ne00; ich += chpt*nxpsg) {
float4x4 lx[chpt];
#pragma unroll(chpt)
for (short ch = 0; ch < chpt; ++ch) {
deq_t4x4(xq, cch, lx[ch]);
cch += nxpsg;
if (cch >= chpb) {
xq += cch/chpb;
cch %= chpb;
}
}
#pragma unroll(chpt)
for (short ch = 0; ch < chpt; ++ch) {
#pragma unroll(r1ptg)
for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
sumf[ir1] +=
dot(lx[ch][0], y4x4[ir1][ch*nxpsg][0]) +
dot(lx[ch][1], y4x4[ir1][ch*nxpsg][1]) +
dot(lx[ch][2], y4x4[ir1][ch*nxpsg][2]) +
dot(lx[ch][3], y4x4[ir1][ch*nxpsg][3]);
}
}
#pragma unroll(r1ptg)
for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
y4x4[ir1] += chpt*nxpsg;
}
}
for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
if (nxpsg >= 32) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
}
if (nxpsg >= 16) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 8);
}
if (nxpsg >= 8) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 4);
}
if (nxpsg >= 4) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 2);
}
if (nxpsg >= 2) {
sumf[ir1] += simd_shuffle_down(sumf[ir1], 1);
}
//sumf[ir1] = simd_sum(sumf[ir1]);
}
if (tx == 0) {
for (short ir1 = 0; ir1 < r1ptg && i11 + ir1 < args.ne11; ++ir1) {
device float * dst_f32 = (device float *) dst + (uint64_t)i1m*args.ne0*args.ne1 + (uint64_t)(i11 + ir1)*args.ne0;
if (i01 < args.ne01) {
dst_f32[i01] = sumf[ir1];
}
}
}
}
// dispatchers needed for compile-time nxpsg
// epb - elements per quantization block
template<short r1ptg, typename q_t, short epb, void (*deq_t4)(device const q_t *, short, thread float4 &)>
kernel void kernel_mul_mv_ext_q4_f32_disp(
constant ggml_metal_kargs_mul_mv_ext & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
switch (args.nxpsg) {
case 4: kernel_mul_mv_ext_q4_f32_impl<4, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
case 8: kernel_mul_mv_ext_q4_f32_impl<8, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
case 16: kernel_mul_mv_ext_q4_f32_impl<16, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
case 32: kernel_mul_mv_ext_q4_f32_impl<32, r1ptg, q_t, epb/4, deq_t4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
}
}
template<short r1ptg, typename q_t, short epb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &)>
kernel void kernel_mul_mv_ext_q4x4_f32_disp(
constant ggml_metal_kargs_mul_mv_ext & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
switch (args.nxpsg) {
case 4: kernel_mul_mv_ext_q4x4_f32_impl<4, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
case 8: kernel_mul_mv_ext_q4x4_f32_impl<8, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
case 16: kernel_mul_mv_ext_q4x4_f32_impl<16, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
case 32: kernel_mul_mv_ext_q4x4_f32_impl<32, r1ptg, q_t, epb/16, deq_t4x4>(args, src0, src1, dst, tgpig, tiisg, sgitg); break;
}
}
typedef decltype(kernel_mul_mv_ext_q4_f32_disp <2, block_q8_0, 32, dequantize_q8_0_t4>) mul_mv_ext_q4_f32_t;
typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>) mul_mv_ext_q4x4_f32_t;
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, half4, 4, dequantize_f16_t4>;
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, half4, 4, dequantize_f16_t4>;
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>;
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_1, 32, dequantize_q4_1_t4>;
template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_1, 32, dequantize_q4_1_t4>;
template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_1, 32, dequantize_q4_1_t4>;
template [[host_name("kernel_mul_mv_ext_q4_1_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q4_1, 32, dequantize_q4_1_t4>;
template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_0, 32, dequantize_q5_0_t4>;
template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_0, 32, dequantize_q5_0_t4>;
template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_0, 32, dequantize_q5_0_t4>;
template [[host_name("kernel_mul_mv_ext_q5_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_0, 32, dequantize_q5_0_t4>;
template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q5_1, 32, dequantize_q5_1_t4>;
template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q5_1, 32, dequantize_q5_1_t4>;
template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q5_1, 32, dequantize_q5_1_t4>;
template [[host_name("kernel_mul_mv_ext_q5_1_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q5_1, 32, dequantize_q5_1_t4>;
template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q8_0, 32, dequantize_q8_0_t4>;
template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q8_0, 32, dequantize_q8_0_t4>;
template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q8_0, 32, dequantize_q8_0_t4>;
template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_q8_0, 32, dequantize_q8_0_t4>;
template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
template [[host_name("kernel_mul_mv_ext_iq4_nl_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, block_iq4_nl, 32, dequantize_iq4_nl_t4>;
template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q4_K, 256, dequantize_q4_K>;
template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q4_K, 256, dequantize_q4_K>;
template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q4_K, 256, dequantize_q4_K>;
template [[host_name("kernel_mul_mv_ext_q4_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q4_K, 256, dequantize_q4_K>;
template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_K, 256, dequantize_q5_K>;
template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q5_K, 256, dequantize_q5_K>;
template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_K, 256, dequantize_q5_K>;
template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_K, 256, dequantize_q5_K>;
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
#define N_MV_T_T 4 #define N_MV_T_T 4
template<typename T0, typename T04, typename T1, typename T14, typename args_t> template<typename T0, typename T04, typename T1, typename T14, typename args_t>
@ -2258,6 +2728,79 @@ kernel void kernel_im2col_ext(
template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>; template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>; template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
typedef void (conv_transpose_1d_t)(
device const float * src0,
device const float * src1,
device char * dst,
constant int32_t & IC,
constant int32_t & IL,
constant int32_t & K,
constant int32_t & s0,
constant uint64_t & nb0,
constant uint64_t & nb1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]]);
template <typename T>
kernel void kernel_conv_transpose_1d(
device const T * src0,
device const float * src1,
device char * dst,
constant int32_t & IC,
constant int32_t & IL,
constant int32_t & K,
constant int32_t & s0,
constant uint64_t & nb0,
constant uint64_t & nb1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]]) {
float v = 0.0f;
for (int64_t c = 0; c < IC; c++) {
const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1];
const int32_t input_offset = c * IL;
for (int64_t i = 0; i < IL; i++) {
if (tgpig[0] >= i * s0 && tgpig[0] < i * s0 + K) {
v += src0[kernel_offset + tgpig[0] - i * s0] * src1[input_offset + i];
}
}
}
device float * dst_ptr = (device float *) (dst + tgpig[0] * nb0 + tgpig[1] * nb1);
dst_ptr[0] = v;
}
template [[host_name("kernel_conv_transpose_1d_f32_f32")]]
kernel void kernel_conv_transpose_1d<float>(
device const float * src0,
device const float * src1,
device char * dst,
constant int32_t & IC,
constant int32_t & IL,
constant int32_t & K,
constant int32_t & s0,
constant uint64_t & nb0,
constant uint64_t & nb1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]]);
template [[host_name("kernel_conv_transpose_1d_f16_f32")]]
kernel void kernel_conv_transpose_1d<half>(
device const half * src0,
device const float * src1,
device char * dst,
constant int32_t & IC,
constant int32_t & IL,
constant int32_t & K,
constant int32_t & s0,
constant uint64_t & nb0,
constant uint64_t & nb1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]]);
kernel void kernel_upscale_f32( kernel void kernel_upscale_f32(
device const char * src0, device const char * src0,
device char * dst, device char * dst,
@ -2354,6 +2897,53 @@ kernel void kernel_pad_f32(
} }
} }
kernel void kernel_pad_reflect_1d_f32(
device const char * src0,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & ne0,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int32_t & p0,
constant int32_t & p1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
if (i0 < p0) {
dst_ptr[i0] = src0_ptr[p0 - i0];
} else if (i0 < ne0 - p1) {
dst_ptr[i0] = src0_ptr[i0 - p0];
} else {
dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
}
}
}
}
kernel void kernel_arange_f32( kernel void kernel_arange_f32(
device char * dst, device char * dst,
constant int64_t & ne0, constant int64_t & ne0,
@ -3337,6 +3927,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_
#undef FA_TYPES #undef FA_TYPES
template<typename T>
kernel void kernel_set(
constant ggml_metal_kargs_set & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i13 = tgpig[2];
const int i12 = tgpig[1];
const int i11 = tgpig[0];
const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10;
const int64_t i3 = n / (args.ne12*args.ne11*args.ne10);
const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10);
const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10;
device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs);
for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) {
device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10);
dst_data[i10] = (T) src[0];
}
}
typedef decltype(kernel_set<float>) kernel_set_t;
template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set<float>;
template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set<int32_t>;
template<typename T0, typename T1> template<typename T0, typename T1>
kernel void kernel_cpy( kernel void kernel_cpy(
constant ggml_metal_kargs_cpy & args, constant ggml_metal_kargs_cpy & args,

View file

@ -1689,9 +1689,14 @@ namespace dpct
auto data_a = get_memory<const Ta>(a); auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b); auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c); auto data_c = get_memory<Tc>(c);
oneapi::mkl::blas::column_major::gemm( #ifdef GGML_SYCL_NVIDIA
q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
data_b, ldb, beta_value, data_c, ldc); a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
beta_value, data_c, ldc);
#else
oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
beta_value, data_c, ldc);
#endif
} }
template <typename VecT, class BinaryOperation, class = void> template <typename VecT, class BinaryOperation, class = void>
@ -1754,14 +1759,22 @@ namespace dpct
matrix_info->ld_info[2] = ldc; matrix_info->ld_info[2] = ldc;
matrix_info->groupsize_info = batch_size; matrix_info->groupsize_info = batch_size;
#ifdef GGML_SYCL_NVIDIA
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
matrix_info->size_info, matrix_info->size_info + 1, matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
matrix_info->size_info + 2, matrix_info->value_info, matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), &(matrix_info->groupsize_info));
#else
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif
q.submit([&](sycl::handler &cgh) q.submit([&](sycl::handler &cgh)
{ {
@ -1783,10 +1796,16 @@ namespace dpct
auto data_a = get_memory<const Ta>(a); auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b); auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c); auto data_c = get_memory<Tc>(c);
#ifdef GGML_SYCL_NVIDIA
oneapi::mkl::blas::column_major::gemm_batch( oneapi::mkl::blas::column_major::gemm_batch(
q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, a_trans, b_trans, m, n, k,
stride_a, data_b, ldb, stride_b, beta_value, alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c,
data_c, ldc, stride_c, batch_size); batch_size);
#else
oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc,
stride_c, batch_size);
#endif
} }
} // namespace detail } // namespace detail

View file

@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.device_count = dpct::dev_mgr::instance().device_count(); info.device_count = dpct::dev_mgr::instance().device_count();
if (info.device_count == 0) { if (info.device_count == 0) {
fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__); GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
return info; return info;
} }
@ -55,16 +55,16 @@ static ggml_sycl_device_info ggml_sycl_init() {
int64_t total_vram = 0; int64_t total_vram = 0;
#if defined(GGML_SYCL_FORCE_MMQ) #if defined(GGML_SYCL_FORCE_MMQ)
fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
#else #else
fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
#endif #endif
#if defined(SYCL_USE_XMX) #if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
#else #else
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
#endif #endif
fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count); GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
for (int i = 0; i < info.device_count; ++i) { for (int i = 0; i < info.device_count; ++i) {
info.devices[i].vmm = 0; info.devices[i].vmm = 0;
@ -110,7 +110,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
auto global_mem_size = prop.get_global_mem_size()/1000000; auto global_mem_size = prop.get_global_mem_size()/1000000;
fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(), name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str()); global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
@ -120,18 +120,29 @@ void ggml_backend_sycl_print_sycl_devices() {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
int device_count = dpct::dev_mgr::instance().device_count(); int device_count = dpct::dev_mgr::instance().device_count();
std::map<std::string, size_t> DeviceNums; std::map<std::string, size_t> DeviceNums;
fprintf(stderr, "found %d SYCL devices:\n", device_count); GGML_LOG_INFO("Found %d SYCL devices:\n", device_count);
fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n"); GGML_LOG_INFO(
fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n"); "| | | | "
fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n"); " |Max | |Max |Global | |\n");
GGML_LOG_INFO(
"| | | | "
" |compute|Max work|sub |mem | |\n");
GGML_LOG_INFO(
"|ID| Device Type| "
"Name|Version|units |group |group|size | Driver version|\n");
GGML_LOG_INFO(
"|--|-------------------|---------------------------------------|------"
"-|-------|--------|-----|-------|---------------------|\n");
for (int id = 0; id < device_count; ++id) { for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id); sycl::device device = dpct::dev_mgr::instance().get_device(id);
sycl::backend backend = device.get_backend(); sycl::backend backend = device.get_backend();
std::string backend_type = get_device_backend_and_type(device); std::string backend_type = get_device_backend_and_type(device);
int type_id = DeviceNums[backend_type]++; int type_id = DeviceNums[backend_type]++;
std::stringstream device_type; std::stringstream device_type;
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; device_type << "[" << backend_type << ":" << std::to_string(type_id)
<< "]";
print_device_detail(id, device, device_type.str()); print_device_detail(id, device, device_type.str());
} }
} }
@ -154,15 +165,14 @@ static void ggml_check_sycl() try {
static bool initialized = false; static bool initialized = false;
if (!initialized) { if (!initialized) {
fprintf(stderr, "[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n");
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
#if defined(GGML_SYCL_F16) #if defined(GGML_SYCL_F16)
fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__);
#else #else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__);
#endif #endif
/* NOT REMOVE, keep it for next optimize for XMX. /* NOT REMOVE, keep it for next optimize for XMX.
@ -180,9 +190,10 @@ static void ggml_check_sycl() try {
return; return;
} }
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();
initialized = true; initialized = true;
g_sycl_loaded = true; g_sycl_loaded = true;
ggml_backend_sycl_print_sycl_devices();
} }
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
@ -205,7 +216,7 @@ inline void check_allow_gpu_index(const int device_index) {
__func__, __func__,
device_index, device_index,
ggml_sycl_info().device_count - 1); ggml_sycl_info().device_count - 1);
fprintf(stderr, "%s\n", error_buf); GGML_LOG_ERROR("%s\n", error_buf);
assert(false); assert(false);
} }
} }
@ -475,7 +486,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream))); size, *stream)));
if (!dev_ptr) { if (!dev_ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size); GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr; return nullptr;
} }
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
@ -752,7 +763,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
size, *stream))); size, *stream)));
if (!buf) { if (!buf) {
char err_buf[1024]; char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size); snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
throw std::runtime_error(err_buf); throw std::runtime_error(err_buf);
} }
// set padding to 0 to avoid possible NaN values // set padding to 0 to avoid possible NaN values
@ -1142,7 +1153,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
look_ahead_size, *qptr))); look_ahead_size, *qptr)));
if (!ptr) { if (!ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size); GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
return nullptr; return nullptr;
} }
@ -1150,9 +1161,10 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
pool_size += look_ahead_size; pool_size += look_ahead_size;
#ifdef DEBUG_SYCL_MALLOC #ifdef DEBUG_SYCL_MALLOC
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
#endif #endif
// GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr); // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
return ptr; return ptr;
} }
@ -1166,7 +1178,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
return; return;
} }
} }
fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
pool_size -= size; pool_size -= size;
} }
@ -2437,7 +2449,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
break; break;
default: default:
// TODO: k-quants // TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
break; break;
} }
@ -2561,12 +2573,17 @@ inline void ggml_sycl_op_mul_mat_sycl(
const float alpha = 1.0f; const float alpha = 1.0f;
const float beta = 0.0f; const float beta = 0.0f;
#if !GGML_SYCL_DNNL #if !GGML_SYCL_DNNL
# ifdef GGML_SYCL_NVIDIA
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), # else
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
dst_dd_i, ldc))); dst_dd_i, ldc)));
# endif
#else #else
auto dnnl_stream = ctx.stream_dnnl(stream); auto dnnl_stream = ctx.stream_dnnl(stream);
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(), DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
@ -3750,7 +3767,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else { } else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
@ -3825,7 +3842,7 @@ void ggml_sycl_set_main_device(const int main_device) try {
dpct::device_info prop; dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(main_device)))); prop, dpct::dev_mgr::instance().get_device(main_device))));
fprintf(stderr, "Using device %d (%s) as main device\n", GGML_LOG_INFO("Using device %d (%s) as main device\n",
main_device, prop.get_name()); main_device, prop.get_name());
} }
} }
@ -4172,7 +4189,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
#endif #endif
bool ok = ggml_sycl_compute_forward(*sycl_ctx, node); bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
if (!ok) { if (!ok) {
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
} }
GGML_ASSERT(ok); GGML_ASSERT(ok);
} }
@ -4672,7 +4689,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device); ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
if (ctx == nullptr) { if (ctx == nullptr) {
fprintf(stderr, "%s: error: failed to allocate context\n", __func__); GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return nullptr; return nullptr;
}; };

View file

@ -40,14 +40,14 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr
try { try {
// Perform matrix multiplication using oneMKL GEMM // Perform matrix multiplication using oneMKL GEMM
oneapi::mkl::blas::column_major::gemm(*stream, #ifdef GGML_SYCL_NVIDIA
oneapi::mkl::transpose::nontrans, src1_op, oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream },
ne0, ne1, ne01, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d,
alpha, ne00, src1_d, ldb, beta, dst_d, ne0);
src0_d, ne00, #else
src1_d, ldb, oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha,
beta, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
dst_d, ne0); #endif
} }
catch (sycl::exception const& exc) { catch (sycl::exception const& exc) {
std::cerr << exc.what() << std::endl; std::cerr << exc.what() << std::endl;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,305 @@
#include "types.comp"
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
block_q4_0_packed16 block;
};
float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const uint idx = coordInBlock[1];
const uint shift = (idx & 0x10) >> 2;
uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1];
qs >>= shift;
qs &= 0xF;
float16_t ret = (float16_t(qs) - float16_t(8)) * d;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
block_q4_1 block;
};
float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const float16_t m = bl.block.m;
const uint idx = coordInBlock[1];
const uint iqs = idx & 0xF;
const uint shift = (idx & 0x10) >> 2;
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
qs &= 0xF;
float16_t ret = float16_t(qs) * d + m;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 {
block_q5_0 block;
};
float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const uint idx = coordInBlock[1];
const uint iqs = idx & 0xF;
const uint uint_qh = uint(bl.block.qh[1]) << 16 | bl.block.qh[0];
const uint qh = ((uint_qh >> idx) << 4) & 0x10;
const uint shift = (idx & 0x10) >> 2;
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
qs &= 0xF;
float16_t ret = (float16_t(qs | qh) - float16_t(16)) * d;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 {
block_q5_1 block;
};
float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const float16_t m = bl.block.m;
const uint idx = coordInBlock[1];
const uint iqs = idx & 0xF;
const uint uint_qh = bl.block.qh;
const uint qh = ((uint_qh >> idx) << 4) & 0x10;
const uint shift = (idx & 0x10) >> 2;
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
qs &= 0xF;
float16_t ret = float16_t(qs | qh) * d + m;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
block_q8_0_packed16 block;
};
float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const uint idx = coordInBlock[1];
const uint iqs = idx;
// Load 16b and select the byte for this element
int32_t qs = unpack8(int32_t(bl.block.qs[(iqs & 0x1E) >> 1]))[iqs & 1];
float16_t ret = float16_t(qs) * d;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K {
block_q2_K block;
};
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const f16vec2 d = bl.block.d;
const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31
const uint scalesi = iqs / 16; // 0..15
const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6
uint32_t qs = bl.block.qs[qsi];
const uint scales = bl.block.scales[scalesi];
float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4);
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K {
block_q3_K block;
};
float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 128; // 0,1
const uint qsi = n * 32 + (iqs % 32); // 0..63
const uint hmi = (iqs % 32); // 0..31
const uint j = (iqs % 128) / 8; // 0..15
const uint is = iqs / 16; // 0..15
const uint halfsplit = ((iqs % 128) / 32); // 0,1,2,3
const uint qsshift = halfsplit * 2; // 0,2,4,6
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
uint32_t scaleidx0 = (is < 8) ? is : (is-8);
uint32_t scaleidx0shift = (is < 8) ? 0 : 4;
uint32_t scaleidx1 = is + 8 - (is/4)*4;
uint32_t scaleidx1shift = (is/4)*2;
const int8_t us = int8_t(((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4));
const float16_t dl = bl.block.d * float16_t(us - 32);
float16_t ret = dl * float16_t(int8_t((bl.block.qs[qsi ] >> qsshift) & 3) - (((bl.block.hmask[hmi ] & m) != 0) ? 0 : 4));
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K {
block_q4_K block;
};
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 64; // 0,1,2,3
const uint b = (iqs % 64) / 32; // 0,1
const uint is = (idx & 0xE0) >> 5; // 0..7
const uint qsi = n * 32 + (iqs % 32); // 0..127
const f16vec2 loadd = bl.block.d;
uint32_t sc;
uint32_t mbyte;
uint32_t scidx0 = (is < 4) ? is : (is + 4);
uint32_t scidx1 = (is < 4) ? is : (is - 4);
uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t scidxshift1 = (is < 4) ? 0 : 2;
uint32_t mbidx0 = is + 4;
uint32_t mbidx1 = (is < 4) ? is + 4 : is;
uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0;
uint32_t mbidxshift0 = (is < 4) ? 0 : 4;
uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t mbidxshift1 = (is < 4) ? 0 : 2;
sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1));
const float16_t d = loadd.x * float16_t(sc);
const float16_t m = loadd.y * float16_t(mbyte);
uint32_t dmask = 0xF << (b * 4);
float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
block_q5_K block;
};
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 64; // 0,1,2,3
const uint b = (iqs % 64) / 32; // 0,1
const uint is = (idx & 0xE0) >> 5; // 0..7
const uint qsi = n * 32 + (iqs % 32); // 0..127
const uint qhi = (iqs % 32); // 0..31
const uint8_t hm = uint8_t(1 << (iqs / 32));
const f16vec2 loadd = bl.block.d;
uint32_t sc;
uint32_t mbyte;
uint32_t scidx0 = (is < 4) ? is : (is + 4);
uint32_t scidx1 = (is < 4) ? is : (is - 4);
uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t scidxshift1 = (is < 4) ? 0 : 2;
uint32_t mbidx0 = is + 4;
uint32_t mbidx1 = (is < 4) ? is + 4 : is;
uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0;
uint32_t mbidxshift0 = (is < 4) ? 0 : 4;
uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t mbidxshift1 = (is < 4) ? 0 : 2;
sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1));
const float16_t d = loadd.x * float16_t(sc);
const float16_t m = loadd.y * float16_t(mbyte);
uint32_t dmask = 0xF << (b * 4);
float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m;
return ret;
}
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K {
block_q6_K block;
};
float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 128; // 0,1
const uint b = (iqs % 128) / 64; // 0,1
const uint is_b = (iqs % 32) / 16; // 0,1
const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6
const uint is = 8 * n + qhshift + is_b; // 0..15
const uint qsi = n * 64 + (iqs % 64); // 0..127
const uint qhi = n * 32 + (iqs % 32); // 0..63
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32);
return ret;
}
#if defined(DATA_A_IQ4_NL)
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
block_iq4_nl block;
};
float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const float16_t d = bl.block.d;
const uint idx = coordInBlock[1];
const uint iqs = idx & 0xF;
const uint shift = (idx & 0x10) >> 2;
uint32_t qs = bl.block.qs[iqs];
qs >>= shift;
qs &= 0xF;
float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
return ret;
}
#endif
#if defined(DATA_A_Q4_0)
#define dequantFuncA dequantFuncQ4_0
#elif defined(DATA_A_Q4_1)
#define dequantFuncA dequantFuncQ4_1
#elif defined(DATA_A_Q5_0)
#define dequantFuncA dequantFuncQ5_0
#elif defined(DATA_A_Q5_1)
#define dequantFuncA dequantFuncQ5_1
#elif defined(DATA_A_Q8_0)
#define dequantFuncA dequantFuncQ8_0
#elif defined(DATA_A_Q2_K)
#define dequantFuncA dequantFuncQ2_K
#elif defined(DATA_A_Q3_K)
#define dequantFuncA dequantFuncQ3_K
#elif defined(DATA_A_Q4_K)
#define dequantFuncA dequantFuncQ4_K
#elif defined(DATA_A_Q5_K)
#define dequantFuncA dequantFuncQ5_K
#elif defined(DATA_A_Q6_K)
#define dequantFuncA dequantFuncQ6_K
#elif defined(DATA_A_IQ4_NL)
#define dequantFuncA dequantFuncIQ4_NL
#endif

View file

@ -0,0 +1,289 @@
#version 450
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_NV_cooperative_matrix2 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_vote : enable
#extension GL_EXT_null_initializer : enable
#include "types.comp"
#include "dequant_funcs_cm2.comp"
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 1) const uint32_t Br = 32;
layout (constant_id = 2) const uint32_t Bc = 32;
layout (constant_id = 3) const uint32_t D = 32;
layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV;
layout (push_constant) uniform parameter {
uint32_t N;
uint32_t KV;
uint32_t ne1;
uint32_t ne2;
uint32_t ne3;
uint32_t neq2;
uint32_t neq3;
uint32_t nek2;
uint32_t nek3;
uint32_t nev2;
uint32_t nev3;
uint32_t nem1;
uint32_t nb02;
uint32_t nb03;
uint32_t nb12;
uint32_t nb13;
uint32_t nb22;
uint32_t nb23;
uint32_t nb31;
float scale;
float max_bias;
float logit_softcap;
uint32_t mask;
uint32_t n_head_log2;
float m0;
float m1;
} p;
layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
layout (binding = 1) readonly buffer K {uint8_t data_k[];};
layout (binding = 2) readonly buffer V {uint8_t data_v[];};
layout (binding = 3) readonly buffer M {uint8_t data_m[];};
layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
return max(x, y);
}
ACC_TYPE smearReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
return x;
}
// Replace matrix elements >= numRows or numCols with 'replace'
ACC_TYPE replacePadding(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem, const in ACC_TYPE replace, const in uint32_t numRows, const in uint32_t numCols) {
if (row >= numRows || col >= numCols) {
return replace;
}
return elem;
}
ACC_TYPE Exp(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem)
{
return exp(elem);
}
ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem0, const in ACC_TYPE elem1)
{
return max(elem0, elem1);
}
#if defined(BLOCK_SIZE)
#define DECODEFUNC , DEQUANTFUNC
#else
#define DECODEFUNC
#endif
void main() {
#if defined(DATA_A_IQ4_NL)
init_iq4nl_shmem();
#endif
const uint32_t N = p.N;
const uint32_t KV = p.KV;
const uint32_t Tr = CEIL_DIV(N, Br);
const uint32_t Tc = CEIL_DIV(KV, Bc);
const uint32_t i = gl_WorkGroupID.x;
const uint32_t iq2 = gl_WorkGroupID.y;
const uint32_t iq3 = gl_WorkGroupID.z;
// broadcast factors
const uint32_t rk2 = p.neq2/p.nek2;
const uint32_t rk3 = p.neq3/p.nek3;
const uint32_t rv2 = p.neq2/p.nev2;
const uint32_t rv3 = p.neq3/p.nev3;
// k indices
const uint32_t ik3 = iq3 / rk3;
const uint32_t ik2 = iq2 / rk2;
// v indices
const uint32_t iv3 = iq3 / rv3;
const uint32_t iv2 = iq2 / rv2;
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp);
tensorLayoutNV<2, Clamp> tensorLayoutV = createTensorLayoutNV(2, Clamp);
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
#if defined(BLOCK_SIZE)
tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE);
tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE);
#endif
tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D);
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Q;
coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D));
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA>(Q);
Qf16 *= float16_t(p.scale);
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-1.0/0.0);
ACC_TYPE slope = ACC_TYPE(1.0);
// ALiBi
if (p.max_bias > 0.0f) {
const uint32_t h = iq2;
const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
slope = pow(base, ACC_TYPE(exph));
}
[[dont_unroll]]
for (uint32_t j = 0; j < Tc; ++j) {
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
coopmat<float16_t, gl_ScopeWorkgroup, D, Bc, gl_MatrixUseB> K_T;
uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC);
S = coopMatMulAdd(Qf16, K_T, S);
if (p.logit_softcap != 0.0f) {
[[unroll]]
for (int k = 0; k < S.length(); ++k) {
S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]);
}
}
if (p.mask != 0) {
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
S += slope*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
}
// Clear padding elements to -inf, so they don't contribute to rowmax
if (Clamp != 0 &&
((j + 1) * Bc > KV ||
(i + 1) * Br > N)) {
uint R = ((i + 1) * Br > N) ? (N % Br) : Br;
uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C);
}
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> rowmax, P, rowsum, eM;
coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce);
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> Mold = M;
// M = max(rowmax, Mold)
// P = e^(S - M)
// eM = e^(Mold - M)
coopMatPerElementNV(M, rowmax, Max, Mold);
coopMatPerElementNV(P, S - M, Exp);
coopMatPerElementNV(eM, Mold - M, Exp);
// Clear padding elements to 0, so they don't contribute to rowsum
if (Clamp != 0 &&
((j + 1) * Bc > KV ||
(i + 1) * Br > N)) {
uint R = ((i + 1) * Br > N) ? (N % Br) : Br;
uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C);
}
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
// compute rowsum by multiplying by matrix of all ones.
coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
rowsum = coopMatMulAdd(P_A, One, rowsum);
coopmat<float16_t, gl_ScopeWorkgroup, Bc, D, gl_MatrixUseB> V;
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC);
L = eM*L + rowsum;
// This is the "diagonal" matrix in the paper, but since we do componentwise
// multiply rather than matrix multiply it has the diagonal element smeared
// across the row
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> eMdiag;
// resize eM by using smear/reduce
coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
O = eMdiag * O;
O = coopMatMulAdd(P_A, V, O);
}
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;
// resize L by using smear/reduce
coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
[[unroll]]
for (int k = 0; k < Ldiag.length(); ++k) {
Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k];
}
O = Ldiag*O;
tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D);
// permute dimensions
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
uint32_t o_offset = iq3*p.ne2*p.ne1;
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute);
}

View file

@ -8,6 +8,13 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint d_offset; uint d_offset;
float param1; float param2; float param1; float param2;
uint ne0_012mp; uint ne0_012L;
uint ne0_01mp; uint ne0_01L;
uint ne0_0mp; uint ne0_0L;
uint ne1_012mp; uint ne1_012L;
uint ne1_01mp; uint ne1_01L;
uint ne1_0mp; uint ne1_0L;
} p; } p;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@ -17,22 +24,30 @@ uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
} }
// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
uint src0_idx(uint idx) { uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00); const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
const uint i02_offset = i02*p.ne01*p.ne00; const uint i02_offset = i02*p.ne01*p.ne00;
const uint i01 = (idx - i03_offset - i02_offset) / p.ne00; const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
} }
uint dst_idx(uint idx) { uint dst_idx(uint idx) {
const uint i13 = idx / (p.ne12*p.ne11*p.ne10); const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
const uint i12_offset = i12*p.ne11*p.ne10; const uint i12_offset = i12*p.ne11*p.ne10;
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
} }

View file

@ -5,7 +5,9 @@
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {float data_a[];}; layout (binding = 0) readonly buffer A {float data_a[];};
layout (binding = 0) readonly buffer A4 {vec4 data_a4[];};
layout (binding = 1) writeonly buffer D {float data_d[];}; layout (binding = 1) writeonly buffer D {float data_d[];};
layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];};
layout (push_constant) uniform parameter { layout (push_constant) uniform parameter {
uint ne; uint ne;
@ -13,17 +15,34 @@ layout (push_constant) uniform parameter {
} p; } p;
void main() { void main() {
const uint idx = gl_GlobalInvocationID.x; // Each invocation handles four consecutive components
const uint idx = gl_GlobalInvocationID.x * 4;
if (idx >= p.ne) { if (idx >= p.ne) {
return; return;
} }
// Check if all four components are in bounds and aligned,
// then use vector loads
if (idx + 3 < p.ne && (p.ne % 4) == 0) {
vec4 result = vec4(0.0f);
[[unroll]] for (uint i = 0; i < p.k_num; i++) {
result += data_a4[(i * p.ne + idx) / 4];
}
data_d4[idx / 4] = result;
} else {
[[unroll]] for (uint j = 0; j < 4; ++j) {
if (idx + j < p.ne) {
float result = 0.0f; float result = 0.0f;
[[unroll]] for (uint i = 0; i < p.k_num; i++) { [[unroll]] for (uint i = 0; i < p.k_num; i++) {
result += data_a[i * p.ne + idx]; result += data_a[i * p.ne + idx + j];
} }
data_d[idx] = result; data_d[idx + j] = result;
}
}
}
} }

View file

@ -0,0 +1,328 @@
#version 450
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_NV_cooperative_matrix2 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_vote : enable
#include "types.comp"
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 1) const uint BM = 64;
layout (constant_id = 2) const uint BN = 64;
layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
layout (push_constant) uniform parameter
{
uint M;
uint N;
uint K;
uint stride_a;
uint stride_b;
uint stride_d;
uint batch_stride_a;
uint batch_stride_b;
uint batch_stride_d;
#ifdef MUL_MAT_ID
uint nei0;
uint nei1;
uint nbi1;
uint ne11;
#else
uint k_split;
uint ne02;
uint ne12;
uint broadcast2;
uint broadcast3;
#endif
} p;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#if QUANT_K > 1
#define DECODEFUNCA , dequantFuncA
#define MAT_A_TYPE float16_t
#include "dequant_funcs_cm2.comp"
#else
#define DECODEFUNCA
#define MAT_A_TYPE A_TYPE
#endif
#define MAT_B_TYPE B_TYPE
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
shared u16vec4 row_ids[3072];
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
B_TYPE b[];
};
uint _ne1;
shared uint _ne1_sh;
B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
const uint row_i = blockCoords[0];
if (row_i >= _ne1) {
return B_TYPE(0.0);
}
const u16vec4 row_idx = row_ids[row_i];
B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];
return ret;
}
D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic)
{
uint dr = ir * BM + r;
uint dc = ic * BN + c;
if (dr < p.M && dc < _ne1) {
uint row_i = dc;
const u16vec4 row_idx = row_ids[row_i];
data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem;
}
return elem;
}
#endif
void main() {
#if defined(DATA_A_IQ4_NL)
init_iq4nl_shmem();
#endif
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
#else
const uint batch_idx = gl_GlobalInvocationID.z;
const uint i13 = batch_idx / p.ne12;
const uint i12 = batch_idx % p.ne12;
const uint i03 = i13 / p.broadcast3;
const uint i02 = i12 / p.broadcast2;
const uint batch_idx_a = i03 * p.ne02 + i02;
#endif
const uint blocks_m = (p.M + BM - 1) / BM;
const uint ir = gl_WorkGroupID.x % blocks_m;
const uint ik = gl_WorkGroupID.x / blocks_m;
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
// Spread the search across all elements in the first subgroup
if (gl_SubgroupID == 0) {
_ne1 = 0;
uint num_elements = p.nei1 * p.nei0;
for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) {
bool in_range = i < num_elements;
uint ii0 = i % p.nei0;
uint ii1 = i / p.nei0;
uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
uint idx = subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
}
_ne1 += subgroupBallotBitCount(ballot);
}
_ne1_sh = _ne1;
}
barrier();
_ne1 = _ne1_sh;
// Workgroup has no work
if (ic * BN >= _ne1) return;
#endif
#ifdef MUL_MAT_ID
uint start_k = 0;
const uint end_k = p.K;
#else
uint start_k = ik * p.k_split;
const uint end_k = min(p.K, (ik + 1) * p.k_split);
#endif
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
#ifdef MUL_MAT_ID
uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K;
uint pos_b = 0;
#else
uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K;
uint pos_b = batch_idx * p.batch_stride_b;
#endif
uint stride_a = p.stride_a / QUANT_K;
uint stride_b = p.stride_b;
// Hint to the compiler that values are aligned (want 16B alignment).
// Quants are always block-aligned, no alignment needed.
#if ALIGNED
#if QUANT_K == 1
stride_a &= ~7;
#endif
stride_b &= ~7;
#endif
// Create layouts for both clamped and unclamped accesses
tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2);
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
#if QUANT_K > 1
tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K);
#endif
// Use end_k rather than p.K as the dimension because that's what
// we need to bound check against when using split_k
tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k);
tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k);
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M);
tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k);
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
#if !defined(MUL_MAT_ID)
// Detect a fast path where all loads are entirely in bounds and no clamping is required
if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
#if QUANT_K == 1
(stride_a % 8) == 0 &&
#endif
(stride_b % 8) == 0 && (start_k % 8) == 0) {
// Hint to the compiler that values are aligned (want 16B alignment)
start_k &= ~7;
stride_b &= ~7;
#if QUANT_K == 1
stride_a &= ~7;
#endif
tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
uint k_iters = (end_k - start_k + BK - 1) / BK;
for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
coopmat<MAT_A_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<MAT_B_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
}
} else
#endif // !defined(MUL_MAT_ID)
{
tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1);
tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1);
[[dont_unroll]]
for (uint block_k = start_k; block_k < end_k; block_k += BK) {
coopmat<MAT_A_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<MAT_B_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a_ft;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b_ft;
// Clamping is expensive, so detect different code paths for each combination
// of A and B needing clamping.
bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0;
#ifdef MUL_MAT_ID
bool unclampedB = true;
#else
bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0;
#endif
if (unclampedA && unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
#endif
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
} else if (unclampedA && !unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
} else if (!unclampedA && unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
#endif
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
} else if (!unclampedA && !unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
}
}
}
// Convert from ACC_TYPE to D_TYPE
coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
#ifdef MUL_MAT_ID
// Call callback to store each element, remapping row through shared memory
coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
#else
tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
#endif
}

View file

@ -31,6 +31,8 @@
#include <fcntl.h> #include <fcntl.h>
#endif #endif
#include <vulkan/vulkan_core.h>
#define ASYNCIO_CONCURRENCY 64 #define ASYNCIO_CONCURRENCY 64
std::mutex lock; std::mutex lock;
@ -197,15 +199,17 @@ static uint32_t compile_count = 0;
static std::mutex compile_count_mutex; static std::mutex compile_count_mutex;
static std::condition_variable compile_count_cond; static std::condition_variable compile_count_cond;
void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) { void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) {
std::string name = _name + (fp16 ? "" : "_fp32"); std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32"));
std::string out_fname = join_paths(output_dir, name + ".spv"); std::string out_fname = join_paths(output_dir, name + ".spv");
std::string in_path = join_paths(input_dir, in_fname); std::string in_path = join_paths(input_dir, in_fname);
std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2";
#ifdef _WIN32 #ifdef _WIN32
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
#else #else
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", in_path, "-o", out_fname};
#endif #endif
#ifdef GGML_VULKAN_SHADER_DEBUG_INFO #ifdef GGML_VULKAN_SHADER_DEBUG_INFO
@ -255,7 +259,7 @@ std::map<std::string, std::string> merge_maps(const std::map<std::string, std::s
} }
static std::vector<std::future<void>> compiles; static std::vector<std::future<void>> compiles;
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) { void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) {
{ {
// wait until fewer than N compiles are in progress. // wait until fewer than N compiles are in progress.
// 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors.
@ -266,15 +270,15 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
} }
compile_count++; compile_count++;
} }
compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16)); compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc));
} }
void matmul_shaders(bool fp16, bool matmul_id) { void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) {
std::string load_vec = fp16 ? "8" : "4"; std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4";
std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4"; std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4"; std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}}; std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}};
std::string shader_name = "matmul"; std::string shader_name = "matmul";
if (matmul_id) { if (matmul_id) {
@ -286,21 +290,31 @@ void matmul_shaders(bool fp16, bool matmul_id) {
base_dict["FLOAT16"] = "1"; base_dict["FLOAT16"] = "1";
} }
// Shaders with f16 B_TYPE base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";
string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
// Shaders with f16 B_TYPE
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc);
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc);
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc);
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc);
for (const auto& tname : type_names) { for (const auto& tname : type_names) {
std::string data_a_key = "DATA_A_" + to_uppercase(tname); std::string data_a_key = "DATA_A_" + to_uppercase(tname);
// For unaligned, load one at a time for f32/f16, or two at a time for quants // For unaligned, load one at a time for f32/f16, or two at a time for quants
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
// For aligned matmul loads // For aligned matmul loads
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc);
if (tname != "f16" && tname != "f32") {
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc);
}
} }
} }
@ -308,10 +322,49 @@ void process_shaders() {
std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl; std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}}; std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
// matmul
for (const auto& fp16 : {false, true}) { for (const auto& fp16 : {false, true}) {
matmul_shaders(fp16, false); for (const auto& matmul_id : {false, true}) {
matmul_shaders(fp16, true); for (const auto& coopmat2 : {false, true}) {
for (const auto& f16acc : {false, true}) {
#if !defined(VK_NV_cooperative_matrix2)
if (coopmat2) {
continue;
} }
#endif
if (coopmat2 && !fp16) {
continue;
}
if (!coopmat2 && f16acc) {
continue;
}
matmul_shaders(fp16, matmul_id, coopmat2, f16acc);
}
}
}
}
#if defined(VK_NV_cooperative_matrix2)
// flash attention
for (const auto& f16acc : {false, true}) {
std::string acctype = f16acc ? "float16_t" : "float";
for (const auto& tname : type_names) {
if (tname == "f32") {
continue;
}
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc);
} else {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc);
}
}
}
#endif
for (const auto& tname : type_names) { for (const auto& tname : type_names) {
// mul mat vec // mul mat vec

View file

@ -951,6 +951,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK", "POOL_2D_BACK",
"UPSCALE", "UPSCALE",
"PAD", "PAD",
"PAD_REFLECT_1D",
"ARANGE", "ARANGE",
"TIMESTEP_EMBEDDING", "TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
@ -984,7 +985,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW", "OPT_STEP_ADAMW",
}; };
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -1046,6 +1047,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)", "pool_2d_back(x)",
"upscale(x)", "upscale(x)",
"pad(x)", "pad(x)",
"pad_reflect_1d(x)",
"arange(start, stop, step)", "arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)", "timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
@ -1079,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)", "adamw(x)",
}; };
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4098,6 +4100,37 @@ struct ggml_tensor * ggml_pad(
return result; return result;
} }
// ggml_pad_reflect_1d
struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1) {
GGML_ASSERT(p0 >= 0);
GGML_ASSERT(p1 >= 0);
GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(a->type == GGML_TYPE_F32);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] + p0 + p1,
a->ne[1],
a->ne[2],
a->ne[3]);
int32_t params[] = { p0, p1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_PAD_REFLECT_1D;
result->src[0] = a;
return result;
}
// ggml_arange // ggml_arange
struct ggml_tensor * ggml_arange( struct ggml_tensor * ggml_arange(

View file

@ -896,6 +896,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ROPE_FACTORS_LONG,
MODEL_TENSOR.ROPE_FACTORS_SHORT,
MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
@ -1391,6 +1393,7 @@ class RopeScalingType(Enum):
NONE = 'none' NONE = 'none'
LINEAR = 'linear' LINEAR = 'linear'
YARN = 'yarn' YARN = 'yarn'
LONGROPE = 'longrope'
class PoolingType(IntEnum): class PoolingType(IntEnum):

View file

@ -146,6 +146,7 @@ class TensorNameMap:
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
@ -158,6 +159,7 @@ class TensorNameMap:
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j

BIN
glslc.exe

Binary file not shown.

View file

@ -104,6 +104,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
}; };
enum llama_rope_type { enum llama_rope_type {
@ -185,7 +186,8 @@ extern "C" {
LLAMA_ROPE_SCALING_TYPE_NONE = 0, LLAMA_ROPE_SCALING_TYPE_NONE = 0,
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
LLAMA_ROPE_SCALING_TYPE_YARN = 2, LLAMA_ROPE_SCALING_TYPE_YARN = 2,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
}; };
enum llama_pooling_type { enum llama_pooling_type {
@ -992,6 +994,9 @@ extern "C" {
char * buf, char * buf,
int32_t length); int32_t length);
// Get list of built-in chat templates
LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
// //
// Sampling API // Sampling API
// //

View file

@ -645,6 +645,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
case LLAMA_VOCAB_PRE_TYPE_CODESHELL: case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
case LLAMA_VOCAB_PRE_TYPE_EXAONE: case LLAMA_VOCAB_PRE_TYPE_EXAONE:
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
regex_exprs = { regex_exprs = {
"\\p{N}", "\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

View file

@ -1050,6 +1050,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
{ LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@ -1563,6 +1565,67 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
}, },
}; };
enum llm_chat_template {
LLM_CHAT_TEMPLATE_CHATML,
LLM_CHAT_TEMPLATE_LLAMA_2,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
LLM_CHAT_TEMPLATE_MISTRAL_V1,
LLM_CHAT_TEMPLATE_MISTRAL_V3,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_ZEPHYR,
LLM_CHAT_TEMPLATE_MONARCH,
LLM_CHAT_TEMPLATE_GEMMA,
LLM_CHAT_TEMPLATE_ORION,
LLM_CHAT_TEMPLATE_OPENCHAT,
LLM_CHAT_TEMPLATE_VICUNA,
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
LLM_CHAT_TEMPLATE_DEEPSEEK,
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGML_3,
LLM_CHAT_TEMPLATE_CHATGML_4,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "chatml", LLM_CHAT_TEMPLATE_CHATML },
{ "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
{ "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
{ "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
{ "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
{ "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
{ "orion", LLM_CHAT_TEMPLATE_ORION },
{ "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
{ "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
};
static llm_arch llm_arch_from_string(const std::string & name) { static llm_arch llm_arch_from_string(const std::string & name) {
for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
if (kv.second == name) { if (kv.second == name) {
@ -1639,6 +1702,7 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
{ LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
}; };
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@ -5551,8 +5615,12 @@ static void llm_load_hparams(
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 52: model.type = e_model::MODEL_1B; break;
case 40: model.type = e_model::MODEL_2B; break; case 40: model.type = e_model::MODEL_2B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
@ -6452,6 +6520,9 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
vocab.tokenizer_add_bos = true; vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false; vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else { } else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
} }
@ -7046,7 +7117,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
} }
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@ -7682,7 +7753,13 @@ static bool llm_load_tensors(
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
else {
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
if (n_expert == 0) { if (n_expert == 0) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@ -13600,153 +13677,6 @@ struct llm_build_context {
return gf; return gf;
} }
// ref: https://arxiv.org/abs/2203.03466
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
const int64_t n_embd = hparams.n_embd;
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// scale_res - scale the hidden states for residual connection
const float scale_res = scale_depth/sqrtf(float(n_layer));
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled", -1);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// scale the hidden states for residual connection
cur = ggml_scale(ctx0, cur, scale_res);
cb(cur, "hidden_scaled_ffn", -1);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head scaling
const float scale_lmhead = float(n_embd_base)/float(n_embd);
cur = ggml_scale(ctx0, cur, scale_lmhead);
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_minicpm3() { struct ggml_cgraph * build_minicpm3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@ -16845,6 +16775,7 @@ static struct ggml_cgraph * llama_build_graph(
switch (model.arch) { switch (model.arch) {
case LLM_ARCH_LLAMA: case LLM_ARCH_LLAMA:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
{ {
@ -16928,10 +16859,6 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_internlm2(); result = llm.build_internlm2();
} break; } break;
case LLM_ARCH_MINICPM:
{
result = llm.build_minicpm();
} break;
case LLM_ARCH_MINICPM3: case LLM_ARCH_MINICPM3:
{ {
result = llm.build_minicpm3(); result = llm.build_minicpm3();
@ -22021,18 +21948,109 @@ int32_t llama_detokenize(
// chat templates // chat templates
// //
static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
return LLM_CHAT_TEMPLATES.at(tmpl);
}
auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl_contains("<|im_start|>")) {
return LLM_CHAT_TEMPLATE_CHATML;
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
if (tmpl_contains("[SYSTEM_PROMPT]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
} else if (
// catches official 'v1' template
tmpl_contains("' [INST] ' + system_message")
// catches official 'v3' and 'v3-tekken' templates
|| tmpl_contains("[AVAILABLE_TOOLS]")
) {
// Official mistral 'v1', 'v3' and 'v3-tekken' templates
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
if (tmpl_contains(" [INST]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V1;
} else if (tmpl_contains("\"[INST]\"")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
}
return LLM_CHAT_TEMPLATE_MISTRAL_V3;
} else {
// llama2 template and its variants
// [variant] support system message
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
bool support_system_message = tmpl_contains("<<SYS>>");
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
bool strip_message = tmpl_contains("content.strip()");
if (strip_message) {
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
} else if (add_bos_inside_history) {
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
} else if (support_system_message) {
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
} else {
return LLM_CHAT_TEMPLATE_LLAMA_2;
}
}
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
return LLM_CHAT_TEMPLATE_ZEPHYR;
} else if (tmpl_contains("bos_token + message['role']")) {
return LLM_CHAT_TEMPLATE_MONARCH;
} else if (tmpl_contains("<start_of_turn>")) {
return LLM_CHAT_TEMPLATE_GEMMA;
} else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
// OrionStarAI/Orion-14B-Chat
return LLM_CHAT_TEMPLATE_ORION;
} else if (tmpl_contains("GPT4 Correct ")) {
// openchat/openchat-3.5-0106
return LLM_CHAT_TEMPLATE_OPENCHAT;
} else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
// eachadea/vicuna-13b-1.1 (and Orca variant)
if (tmpl_contains("SYSTEM: ")) {
return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
}
return LLM_CHAT_TEMPLATE_VICUNA;
} else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
// deepseek-ai/deepseek-coder-33b-instruct
return LLM_CHAT_TEMPLATE_DEEPSEEK;
} else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
// CohereForAI/c4ai-command-r-plus
return LLM_CHAT_TEMPLATE_COMMAND_R;
} else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
return LLM_CHAT_TEMPLATE_LLAMA_3;
} else if (tmpl_contains("[gMASK]sop")) {
// chatglm3-6b
return LLM_CHAT_TEMPLATE_CHATGML_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGML_4;
} else if (tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
return LLM_CHAT_TEMPLATE_MINICPM;
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct
return LLM_CHAT_TEMPLATE_EXAONE_3;
} else if (tmpl_contains("rwkv-world")) {
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
} else if (tmpl_contains("<|start_of_role|>")) {
return LLM_CHAT_TEMPLATE_GRANITE;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
// Simple version of "llama_apply_chat_template" that only works with strings // Simple version of "llama_apply_chat_template" that only works with strings
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser. // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
static int32_t llama_chat_apply_template_internal( static int32_t llama_chat_apply_template_internal(
const std::string & tmpl, const llm_chat_template tmpl,
const std::vector<const llama_chat_message *> & chat, const std::vector<const llama_chat_message *> & chat,
std::string & dest, bool add_ass) { std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss; std::stringstream ss;
auto tmpl_contains = [&tmpl](std::string haystack) -> bool { if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
// chatml template // chatml template
for (auto message : chat) { for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@ -22040,16 +22058,59 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|im_start|>assistant\n"; ss << "<|im_start|>assistant\n";
} }
} else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) { } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
// Official mistral 'v7' template
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
for (auto message : chat) {
std::string role(message->role);
std::string content(message->content);
if (role == "system") {
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
} else if (role == "user") {
ss << "[INST] " << content << "[/INST]";
}
else {
ss << " " << content << "</s>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
bool is_inside_turn = false;
for (auto message : chat) {
if (!is_inside_turn) {
ss << leading_space << "[INST]" << trailing_space;
is_inside_turn = true;
}
std::string role(message->role);
std::string content(message->content);
if (role == "system") {
ss << content << "\n\n";
} else if (role == "user") {
ss << content << leading_space << "[/INST]";
} else {
ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
is_inside_turn = false;
}
}
} else if (
tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
// llama2 template and its variants // llama2 template and its variants
// [variant] support system message // [variant] support system message
bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral"; // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
// [variant] space before + after response bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
bool space_around_response = tmpl_contains("' ' + eos_token");
// [variant] add BOS inside history // [variant] add BOS inside history
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
// [variant] trim spaces from the input message // [variant] trim spaces from the input message
bool strip_message = tmpl_contains("content.strip()"); bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
// construct the prompt // construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] "; ss << "[INST] ";
@ -22070,12 +22131,11 @@ static int32_t llama_chat_apply_template_internal(
} else if (role == "user") { } else if (role == "user") {
ss << content << " [/INST]"; ss << content << " [/INST]";
} else { } else {
ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>"; ss << content << "</s>";
is_inside_turn = false; is_inside_turn = false;
} }
} }
// llama2 templates seem to not care about "add_generation_prompt" } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
} else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
// Phi 3 // Phi 3
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22084,7 +22144,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>\n"; ss << "<|assistant|>\n";
} }
} else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) { } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
// zephyr template // zephyr template
for (auto message : chat) { for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@ -22092,7 +22152,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>\n"; ss << "<|assistant|>\n";
} }
} else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) { } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history) // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) { for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@ -22101,7 +22161,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<s>assistant\n"; ss << "<s>assistant\n";
} }
} else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) { } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
// google/gemma-7b-it // google/gemma-7b-it
std::string system_prompt = ""; std::string system_prompt = "";
for (auto message : chat) { for (auto message : chat) {
@ -22123,7 +22183,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<start_of_turn>model\n"; ss << "<start_of_turn>model\n";
} }
} else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
// OrionStarAI/Orion-14B-Chat // OrionStarAI/Orion-14B-Chat
std::string system_prompt = ""; std::string system_prompt = "";
for (auto message : chat) { for (auto message : chat) {
@ -22143,7 +22203,7 @@ static int32_t llama_chat_apply_template_internal(
ss << message->content << "</s>"; ss << message->content << "</s>";
} }
} }
} else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) { } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
// openchat/openchat-3.5-0106, // openchat/openchat-3.5-0106,
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22157,13 +22217,13 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "GPT4 Correct Assistant:"; ss << "GPT4 Correct Assistant:";
} }
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) { } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
// eachadea/vicuna-13b-1.1 (and Orca variant) // eachadea/vicuna-13b-1.1 (and Orca variant)
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
if (role == "system") { if (role == "system") {
// Orca-Vicuna variant uses a system prefix // Orca-Vicuna variant uses a system prefix
if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) { if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
ss << "SYSTEM: " << message->content << "\n"; ss << "SYSTEM: " << message->content << "\n";
} else { } else {
ss << message->content << "\n\n"; ss << message->content << "\n\n";
@ -22177,7 +22237,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "ASSISTANT:"; ss << "ASSISTANT:";
} }
} else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) { } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
// deepseek-ai/deepseek-coder-33b-instruct // deepseek-ai/deepseek-coder-33b-instruct
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22192,7 +22252,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "### Response:\n"; ss << "### Response:\n";
} }
} else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) { } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
// CohereForAI/c4ai-command-r-plus // CohereForAI/c4ai-command-r-plus
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22207,7 +22267,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
} }
} else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) { } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
// Llama 3 // Llama 3
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22216,7 +22276,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
} }
} else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) { } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
// chatglm3-6b // chatglm3-6b
ss << "[gMASK]" << "sop"; ss << "[gMASK]" << "sop";
for (auto message : chat) { for (auto message : chat) {
@ -22226,7 +22286,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>"; ss << "<|assistant|>";
} }
} else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) { } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
ss << "[gMASK]" << "<sop>"; ss << "[gMASK]" << "<sop>";
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22235,7 +22295,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>"; ss << "<|assistant|>";
} }
} else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) { } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22247,7 +22307,7 @@ static int32_t llama_chat_apply_template_internal(
ss << trim(message->content); ss << trim(message->content);
} }
} }
} else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
// DeepSeek-V2 // DeepSeek-V2
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22262,7 +22322,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "Assistant:"; ss << "Assistant:";
} }
} else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) { } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
// EXAONE-3.0-7.8B-Instruct // EXAONE-3.0-7.8B-Instruct
for (auto message : chat) { for (auto message : chat) {
@ -22278,7 +22338,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "[|assistant|]"; ss << "[|assistant|]";
} }
} else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) { } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
// this template requires the model to have "\n\n" as EOT token // this template requires the model to have "\n\n" as EOT token
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22288,7 +22348,7 @@ static int32_t llama_chat_apply_template_internal(
ss << message->content << "\n\n"; ss << message->content << "\n\n";
} }
} }
} else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) { } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
// IBM Granite template // IBM Granite template
for (const auto & message : chat) { for (const auto & message : chat) {
std::string role(message->role); std::string role(message->role);
@ -22340,7 +22400,11 @@ int32_t llama_chat_apply_template(
} }
std::string formatted_chat; std::string formatted_chat;
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass); llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
return -1;
}
int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
if (res < 0) { if (res < 0) {
return res; return res;
} }
@ -22350,6 +22414,15 @@ int32_t llama_chat_apply_template(
return res; return res;
} }
int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
auto it = LLM_CHAT_TEMPLATES.begin();
for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
output[i] = it->first.c_str();
std::advance(it, 1);
}
return (int32_t) LLM_CHAT_TEMPLATES.size();
}
// //
// sampling // sampling
// //