diff --git a/README-sycl.md b/README-sycl.md index 7aa4274a9..e3a8e726e 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -311,15 +311,13 @@ Output (example): a. Download & install cmake for Windows: https://cmake.org/download/ -b. Download & install make for Windows provided by mingw-w64 +b. Download & install mingw-w64 make for Windows provided by w64devkit -- Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases. +- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). - Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z). +- Extract `w64devkit` on your pc. -- Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**. - -- Add the **bin** folder path in the Windows system PATH environment. +- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`. ### Build locally: diff --git a/common/common.cpp b/common/common.cpp index 34e62967c..b197498f0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -47,6 +47,10 @@ #define GGML_USE_CUBLAS_SYCL #endif +#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUBLAS_SYCL_VULKAN +#endif + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -400,6 +404,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } sparams.penalty_present = std::stof(argv[i]); + } else if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_range = std::stof(argv[i]); + } else if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_exponent = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -649,8 +665,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n"); +#ifndef GGML_USE_CUBLAS_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -943,6 +959,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); + printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); + printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a6ffd128b..829d68368 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -22,6 +22,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +from convert import HfVocab + # check for any of the given keys in the dictionary and return the value of the first key found def get_key_opts(d, keys): @@ -205,6 +207,8 @@ class Model: return OrionModel if model_architecture == "InternLM2ForCausalLM": return InternLM2Model + if model_architecture == "MiniCPMForCausalLM": + return MiniCPMModel return Model def _is_model_safetensors(self) -> bool: @@ -258,6 +262,8 @@ class Model: return gguf.MODEL_ARCH.ORION if arch == "InternLM2ForCausalLM": return gguf.MODEL_ARCH.INTERNLM2 + if arch == "MiniCPMForCausalLM": + return gguf.MODEL_ARCH.MINICPM raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -402,6 +408,31 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_hf(self): + path = self.dir_model + added_tokens_path = self.dir_model + vocab = HfVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + class GPTNeoXModel(Model): def set_gguf_parameters(self): @@ -1041,6 +1072,24 @@ class MixtralModel(Model): self._set_vocab_sentencepiece() +class MiniCPMModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + + def set_vocab(self): + self._set_vocab_hf() + + class QwenModel(Model): @staticmethod def token_bytes_to_string(b): @@ -1416,8 +1465,32 @@ class InternLM2Model(Model): self.gguf_writer.add_add_space_prefix(add_prefix) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. + special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) + print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") + special_vocab.add_to_gguf(self.gguf_writer) + def _try_get_sft_eos(self, tokenizer): + unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.encode('<|im_end|>') + assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) + if len(unused_145_list) == 1: + eos_token = unused_145_list[0] + if len(im_end_list) == 1: + eos_token = im_end_list[0] + return eos_token + + def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) @@ -1486,8 +1559,9 @@ class InternLM2Model(Model): qkv = data_torch qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - q = rearrange(q, " o g n i -> o (g n i)").T - k = rearrange(k, " o g n i -> o (g n i)").T + # The model weights of q and k equire additional reshape. + q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) v = rearrange(v, " o g n i -> o (g n i)").T self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) diff --git a/convert.py b/convert.py index 75c100118..323e8058d 100755 --- a/convert.py +++ b/convert.py @@ -334,9 +334,9 @@ class Params: class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - try: + if isinstance(self.bpe_tokenizer.get('model'), dict): self.vocab = self.bpe_tokenizer["model"]["vocab"] - except KeyError: + else: self.vocab = self.bpe_tokenizer added_tokens: dict[str, int] if fname_added_tokens is not None: @@ -515,10 +515,14 @@ class HfVocab: # Yield token text, score, and type yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, self.special_ids # Reuse already stored special IDs + token_id, token_text, self.special_ids # Reuse already stored special IDs ) - def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType: + def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: + # Special case for byte tokens + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + return gguf.TokenType.BYTE + # Determine token type based on whether it's a special token return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL @@ -530,7 +534,7 @@ class HfVocab: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], self.special_ids) + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) score = self.get_token_score(self.specials[text]) else: toktype = gguf.TokenType.USER_DEFINED diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 6ac70ba69..031e9806d 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -34,7 +34,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos); + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } @@ -152,20 +152,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ size_t image_pos = prompt.find(""); if (image_pos != std::string::npos) { // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string - size_t pos = 0; - while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) { - user_prompt.replace(pos, 2, "\n"); - pos += 1; // Advance past the replaced newline - } - while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) { - system_prompt.replace(pos, 2, "\n"); - pos += 1; // Advance past the replaced newline - } - printf("system_prompt: %s\n", system_prompt.c_str()); printf("user_prompt: %s\n", user_prompt.c_str()); } else { diff --git a/examples/server/README.md b/examples/server/README.md index fe934dab1..1db7cdf21 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -137,6 +137,10 @@ node index.js `temperature`: Adjust the randomness of the generated text (default: 0.8). + `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled). + + `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0). + `top_k`: Limit the next token selection to the K most probable tokens (default: 40). `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95). @@ -264,7 +268,23 @@ Notice that each `probs` is an array of length `n_probs`. It also accepts all the options of `/completion` except `stream` and `prompt`. -- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- **GET** `/props`: Return current server settings. + +### Result JSON + +```json +{ + "assistant_name": "", + "user_name": "", + "default_generation_settings": { ... }, + "total_slots": 1 +} +``` + +- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. +- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. +- `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp index fe5f81228..f5e696e17 100644 --- a/examples/server/completion.js.hpp +++ b/examples/server/completion.js.hpp @@ -236,214 +236,250 @@ unsigned char completion_js[] = { 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, - 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x24, - 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, - 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x7d, 0x60, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, + 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73, + 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, + 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61, + 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62, + 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20, + 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c, + 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, + 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x27, + 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c, + 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, + 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, + 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, + 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, - 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, - 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, - 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, - 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, - 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, - 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, - 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, - 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, - 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, - 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, - 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, - 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, - 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, - 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, - 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, - 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, - 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, - 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, - 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, - 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, - 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, - 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, - 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, - 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, - 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, - 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, + 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, + 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, + 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, + 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, + 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, + 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, - 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, - 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, - 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, - 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, - 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, - 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, - 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, - 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, - 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, - 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, - 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, - 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, - 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, - 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, - 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, - 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, - 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, - 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, - 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, - 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, - 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, - 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, - 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, - 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, - 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, - 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, - 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, - 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, - 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, - 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, + 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, + 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45, + 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, + 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, + 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f, + 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, + 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, + 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, + 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, + 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, + 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, + 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, + 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, + 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, - 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, - 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, - 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, - 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, - 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, - 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, - 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, - 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, - 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, - 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, - 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, - 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, - 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, - 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, - 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, - 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, - 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, - 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, - 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, - 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, - 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, - 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, - 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, - 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, - 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, - 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, - 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, - 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, - 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, - 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, - 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, - 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, - 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, - 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, - 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, - 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x6d, 0x6f, 0x64, 0x65, - 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29, 0x2e, 0x74, 0x68, 0x65, - 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f, - 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, + 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, + 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, + 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, + 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, + 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, + 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, + 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, + 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, + 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, + 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, + 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, + 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, + 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, + 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, + 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a + 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, + 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, + 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, + 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, + 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, + 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, + 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, + 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, + 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, + 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, + 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, + 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, + 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, + 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, + 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, + 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, + 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, + 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, + 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, + 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, + 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, + 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, + 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, + 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, + 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, + 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, + 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, + 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, + 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, + 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, + 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, + 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, + 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, + 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, + 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, + 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, + 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, + 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, + 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, + 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, + 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, + 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, + 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, + 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, + 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, + 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, + 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, + 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, + 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, + 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, + 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, + 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, + 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, + 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, + 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, + 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, + 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, + 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, + 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, + 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, + 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20, + 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, + 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, 0x29, 0x2e, + 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, + 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, + 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70, + 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, + 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, + 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, + 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, + 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a }; -unsigned int completion_js_len = 5346; +unsigned int completion_js_len = 5782; diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index baaec1d60..ab38a7b40 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -195,7 +195,8 @@ export const llamaComplete = async (params, controller, callback) => { // Get the model info from the server. This is useful for getting the context window and so on. export const llamaModelInfo = async () => { if (!generation_settings) { - generation_settings = await fetch("/model.json").then(r => r.json()); + const props = await fetch("/props").then(r => r.json()); + generation_settings = props.default_generation_settings; } return generation_settings; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b780d7ccf..48a2909dc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -335,6 +335,7 @@ struct llama_server_context // slots / clients std::vector slots; + json default_generation_settings_for_props; llama_server_queue queue_tasks; llama_server_response queue_results; @@ -431,6 +432,9 @@ struct llama_server_context slots.push_back(slot); } + default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props["seed"] = -1; + batch = llama_batch_init(n_ctx, 0, params.n_parallel); // empty system prompt @@ -521,27 +525,29 @@ struct llama_server_context slot->oaicompat_model = ""; } - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); + slot->params.stream = json_value(data, "stream", false); + slot->params.cache_prompt = json_value(data, "cache_prompt", false); + slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); + slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); + slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); + slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); + slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); + slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); + slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); + slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); + slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); + slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); + slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); + slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); + slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); + slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); + slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); + slot->params.seed = json_value(data, "seed", default_params.seed); + slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); // infill if (data.count("input_prefix") != 0) @@ -984,11 +990,6 @@ struct llama_server_context queue_results.send(res); } - json get_model_props() - { - return get_formated_generation(slots[0]); - } - json get_formated_generation(llama_client_slot &slot) { const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); @@ -999,6 +1000,8 @@ struct llama_server_context {"model", params.model_alias}, {"seed", slot.params.seed}, {"temperature", slot.sparams.temp}, + {"dynatemp_range", slot.sparams.dynatemp_range}, + {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, @@ -1160,13 +1163,30 @@ struct llama_server_context task.multitask_id = multitask_id; // when a completion task's prompt array is not a singleton, we split it into multiple requests - if (task.data.count("prompt") && task.data.at("prompt").size() > 1) - { - split_multiprompt_task(task_id, task); - } - // otherwise, it's a single-prompt task, we actually queue it - queue_tasks.post(task); + // if there's numbers in the prompt array it will be treated as an array of tokens + if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) { + bool numbers = false; + for (const auto& e : task.data.at("prompt")) { + if (e.is_number()) { + numbers = true; + break; + } + } + + // NOTE: split_multiprompt_task() does not handle a mix of strings and numbers, + // it will completely stall the server. I don't know where the bug for this is. + // + // if there are numbers, it needs to be treated like a single prompt, + // queue_tasks handles a mix of strings and numbers just fine. + if (numbers) { + queue_tasks.post(task); + } else { + split_multiprompt_task(task_id, task); + } + } else { + queue_tasks.post(task); + } } // for multiple images processing @@ -1248,7 +1268,10 @@ struct llama_server_context void split_multiprompt_task(int multitask_id, task_server& multiprompt_task) { int prompt_count = multiprompt_task.data.at("prompt").size(); - assert(prompt_count > 1); + if (prompt_count <= 1) { + send_error(multiprompt_task, "error while handling multiple prompts"); + return; + } // generate all the ID for subtask std::vector subtask_ids(prompt_count); @@ -2615,7 +2638,9 @@ int main(int argc, char **argv) res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", llama.name_user.c_str() }, - { "assistant_name", llama.name_assistant.c_str() } + { "assistant_name", llama.name_assistant.c_str() }, + { "default_generation_settings", llama.default_generation_settings_for_props }, + { "total_slots", llama.params.n_parallel } }; res.set_content(data.dump(), "application/json; charset=utf-8"); }); @@ -2866,12 +2891,6 @@ int main(int argc, char **argv) } }); - svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) - { - const json data = llama.get_model_props(); - return res.set_content(data.dump(), "application/json; charset=utf-8"); - }); - svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) { return res.set_content("", "application/json; charset=utf-8"); }); diff --git a/expose.cpp b/expose.cpp index ca0101d20..a9a24f7ea 100644 --- a/expose.cpp +++ b/expose.cpp @@ -60,7 +60,7 @@ extern "C" putenv((char*)deviceenv.c_str()); int vulkan_info = inputs.vulkan_info; - vulkandeviceenv = "GGML_VULKAN_DEVICE="+std::to_string(vulkan_info); + vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+std::to_string(vulkan_info); putenv((char*)vulkandeviceenv.c_str()); executable_path = inputs.executable_path; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5034d80f1..53b65e632 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5311,41 +5311,50 @@ template static __global__ void #endif // __CUDA_ARCH__ >= CC_VOLTA } -template -static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { +template +static __global__ void mul_mat_vec_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) { + + const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par; + const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row >= nrows) { + if (row >= nrows_x) { return; } - const int blocks_per_row = ncols / qk; + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; const int blocks_per_warp = vdr * WARP_SIZE / qi; // partial sum for each thread - float tmp = 0.0f; + float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f}; const block_q_t * x = (const block_q_t *) vx; const block_q8_1 * y = (const block_q8_1 *) vy; - for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index + for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row_x; i += blocks_per_warp) { + const int ibx = row*blocks_per_row_x + i; // x block index const int iby = i * (qk/QK8_1); // y block index that aligns with ibx const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { + tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs); + } } // sum up partial sums and write back result #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); - } + for (int j = 0; j < ncols_y; ++j) { + tmp[j] = warp_reduce_sum(tmp[j]); - if (threadIdx.x == 0) { - dst[row] = tmp; + if (threadIdx.x == 0) { + dst[j*nrows_dst + row] = tmp[j]; + } } } @@ -6817,121 +6826,56 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa <<>>(vx, y, dst, ncols, nrows); } -static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} +template +static void mul_mat_vec_q_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { -static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK4_1 == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} + GGML_ASSERT(ncols_x % qk == 0); + GGML_ASSERT(ncols_y <= 4); -static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK5_0 == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK5_1 == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); -} - -static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + switch (ncols_y) { + case 1: + mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + break; + case 2: + mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + break; + case 3: + mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + break; + case 4: + mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + break; + // case 5: + // mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot> + // <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + // break; + // case 6: + // mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot> + // <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + // break; + // case 7: + // mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot> + // <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + // break; + // case 8: + // mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot> + // <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + // break; + default: + GGML_ASSERT(false); + // mul_mat_vec_q<0, qk, qi, block_q_t, vdr, vec_dot> + // <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst); + break; + } } static void ggml_mul_mat_q4_0_q8_1_cuda( @@ -8439,7 +8383,7 @@ static void ggml_cuda_op_mul_mat_q( CUDA_CHECK(cudaGetDevice(&id)); // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { @@ -8570,50 +8514,73 @@ static void ggml_cuda_op_mul_mat_vec_q( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream) { - GGML_ASSERT(ggml_nrows(src1) == 1); - const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; default: GGML_ASSERT(false); @@ -9943,17 +9910,18 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #ifdef GGML_CUDA_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else - const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); #endif // GGML_CUDA_FORCE_DMMV if (use_mul_mat_vec_q) { - // NOTE: this kernel does not support ggml_nrows(src1) > 1 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } } else { - if (use_mul_mat_q) { + if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + } else if (use_mul_mat_q) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); diff --git a/ggml-impl.h b/ggml-impl.h index 2c58075ac..19df66bce 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -19,6 +19,7 @@ extern "C" { // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef __cplusplus #ifndef static_assert #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) @@ -26,6 +27,7 @@ extern "C" { #define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif +#endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) diff --git a/ggml-quants.c b/ggml-quants.c index d2d411245..62fc366ff 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2383,19 +2383,20 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri uint8_t L[QK_K]; uint8_t Laux[32]; + uint8_t Ls[QK_K/32]; + uint8_t Lm[QK_K/32]; float weights[32]; - float mins[QK_K/32]; - float scales[QK_K/32]; + float sw[QK_K/32]; + float mins[QK_K/32]; + float scales[QK_K/32]; for (int i = 0; i < nb; i++) { float sum_x2 = 0; for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; - float sigma2 = sum_x2/QK_K; + float sigma2 = 2*sum_x2/QK_K; float av_x = sqrtf(sigma2); - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { if (quant_weights) { const float * qw = quant_weights + QK_K*i + 32*j; @@ -2403,25 +2404,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri } else { for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); } + float sumw = 0; + for (int l = 0; l < 32; ++l) sumw += weights[l]; + sw[j] = sumw; scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); - //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } } - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw); + float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw); for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); + uint8_t ls = Ls[j]; + uint8_t lm = Lm[j]; if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -2431,8 +2424,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); + y[i].d = GGML_FP32_TO_FP16(d_block); + y[i].dmin = GGML_FP32_TO_FP16(m_block); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { @@ -2690,20 +2683,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri const int nb = n_per_row / QK_K; uint8_t L[QK_K]; - float mins[QK_K/32]; - float scales[QK_K/32]; - float weights[32]; uint8_t Laux[32]; + uint8_t Ls[QK_K/32]; + uint8_t Lm[QK_K/32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float sw[QK_K/32]; + float weights[32]; for (int i = 0; i < nb; i++) { float sum_x2 = 0; for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; - float sigma2 = sum_x2/QK_K; + float sigma2 = 2*sum_x2/QK_K; float av_x = sqrtf(sigma2); - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { if (quant_weights) { const float * qw = quant_weights + QK_K*i + 32*j; @@ -2711,22 +2705,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } else { for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); } + float sumw = 0; + for (int l = 0; l < 32; ++l) sumw += weights[l]; + sw[j] = sumw; + scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } } - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw); + float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw); + for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); + uint8_t ls = Ls[j]; + uint8_t lm = Lm[j]; ls = MIN(63, ls); lm = MIN(63, lm); if (j < 4) { @@ -2738,8 +2729,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); + y[i].d = GGML_FP32_TO_FP16(d_block); + y[i].dmin = GGML_FP32_TO_FP16(m_block); uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { @@ -9050,8 +9041,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict int8_t L[32]; int8_t Laux[32]; float waux[32]; - bool is_on_grid[4]; - bool is_on_grid_aux[4]; uint8_t block_signs[4]; uint32_t q2[2*(QK_K/32)]; @@ -9101,10 +9090,11 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict memset(L, 0, 32); continue; } + float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight); + float eff_max = scale*kMaxQ; float best = 0; - float scale = max/(2*kMaxQ-1); - for (int is = -9; is <= 9; ++is) { - float id = (2*kMaxQ-1+is*0.1f)/max; + for (int is = -6; is <= 6; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/eff_max; float this_scale = 1/id; for (int k = 0; k < 4; ++k) { for (int i = 0; i < 8; ++i) { @@ -9114,9 +9104,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict uint16_t u = 0; for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); int grid_index = kmap_q2xs[u]; - is_on_grid_aux[k] = true; if (grid_index < 0) { - is_on_grid_aux[k] = false; const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); } @@ -9130,16 +9118,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { scale = sumqx/sumq2; best = scale*sumqx; - for (int i = 0; i < 32; ++i) L[i] = Laux[i]; - for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + memcpy(L, Laux, 32); } } - int n_not_ongrid = 0; - for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; - if (n_not_ongrid > 0 && scale > 0) { + if (scale > 0) { float id = 1/scale; for (int k = 0; k < 4; ++k) { - if (is_on_grid[k]) continue; uint16_t u = 0; for (int i = 0; i < 8; ++i) { int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); @@ -9195,49 +9179,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict float d = max_scale/31; y[ibl].d = GGML_FP32_TO_FP16(d); float id = 1/d; - float sumqx = 0, sumq2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(15, l)); q2[2*ib+1] |= ((uint32_t)l << 28); - const float * xb = xbl + 32*ib; - const float * qw = quant_weights + QK_K*ibl + 32*ib; - for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib); - const float db = d * (1 + 2*l); - uint32_t u = 0; - for (int k = 0; k < 4; ++k) { - const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127); - const float * xk = xb + 8*k; - const float * wk = weight + 8*k; - const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); - float best_mse = 0; int best_index = aux8[k]; - for (int j = 0; j < 8; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - best_mse += wk[j] * diff * diff; - } - for (int idx = 0; idx < 256; ++idx) { - grid = (const uint8_t *)(kgrid_q2xs + idx); - float mse = 0; - for (int j = 0; j < 8; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - mse += wk[j] * diff * diff; - } - if (mse < best_mse) { - best_mse = mse; best_index = idx; - } - } - u |= (best_index << 8*k); - grid = (const uint8_t *)(kgrid_q2xs + best_index); - //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); - for (int j = 0; j < 8; ++j) { - float q = db * grid[j] * signs[j]; - sumqx += wk[j] * q * xk[j]; - sumq2 += wk[j] * q * q; - } - } - q2[2*ib] = u; - if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2); } memcpy(y[ibl].qs, q2, QK_K/4); } diff --git a/ggml-quants.h b/ggml-quants.h index 5c9f63bd9..bfdf3c997 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -191,70 +191,74 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +#ifdef __cplusplus +extern "C" { +#endif + // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k); +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k); +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k); +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k); +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k); +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k); -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); -void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k); +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k); +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k); +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k); +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k); +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); -void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k); +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); -void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); -void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); -void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k); +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size); void iq2xs_free_impl(int grid_size); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 51445b5e7..a03df4c65 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -7693,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) { *dsti = *xi; } +static void cpy_1_f16_f32(const char * cxi, char * cdsti) { + const sycl::half *xi = (const sycl::half *)cxi; + float *dsti = (float *)cdsti; + + *dsti = *xi; +} + static void cpy_1_i16_i16(const char * cxi, char * cdsti) { const int16_t *xi = (const int16_t *)cxi; int16_t *dsti = (int16_t *)cdsti; @@ -7709,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) { template static void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, - const sycl::nd_item<3> &item_ct1) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -7721,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // then combine those indices with the corresponding byte offsets to get the total offsets - const int i02 = i / (ne00*ne01); - const int i01 = (i - i02*ne01*ne00) / ne00; - const int i00 = i - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - const int i12 = i / (ne10*ne11); - const int i11 = (i - i12*ne10*ne11) / ne10; - const int i10 = i - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; cpy_1(cx + x_offset, cdst + dst_offset); } @@ -7823,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { template static void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, - const sycl::nd_item<3> &item_ct1) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; @@ -7834,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, return; } - const int i02 = i / (ne00*ne01); - const int i01 = (i - i02*ne01*ne00) / ne00; - const int i00 = (i - i02*ne01*ne00 - i01*ne00); - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - const int i12 = i / (ne10*ne11); - const int i11 = (i - i12*ne10*ne11) / ne10; - const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -10599,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10615,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10624,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10640,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10649,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { GGML_ASSERT(ne % QK8_0 == 0); @@ -10661,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { cpy_f32_q( - cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, item_ct1); + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); }); } static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { GGML_ASSERT(ne % QK4_0 == 0); @@ -10680,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { cpy_f32_q( - cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, item_ct1); + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); }); } static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { GGML_ASSERT(ne % QK4_1 == 0); @@ -10699,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { cpy_f32_q( - cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, item_ct1); + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); }); } static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10722,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10731,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10747,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10756,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10772,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -13910,19 +13940,23 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - GGML_ASSERT(src0->ne[3] == 1); + const int64_t ne02 = src0->ne[2]; + const int64_t nb00 = src0->nb[0]; const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t nb03 = src0->nb[3]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - GGML_ASSERT(src1->ne[3] == 1); + const int64_t ne12 = src1->ne[2]; + const int64_t nb10 = src1->nb[0]; const int64_t nb11 = src1->nb[1]; const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; @@ -13934,21 +13968,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index]; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { - ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { - ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 14fb89e09..9e2846ee4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "ggml.h" #include "ggml-backend-impl.h" @@ -37,6 +38,8 @@ #define GGML_VK_MAX_NODES 8192 +#define MAX_VK_BUFFERS 256 + #ifndef K_QUANTS_PER_ITERATION #define K_QUANTS_PER_ITERATION 1 #else @@ -53,15 +56,68 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA } \ } while (0) -struct vk_buffer { +struct ggml_backend_vk_context; + +struct vk_queue { + uint32_t queue_family_index; + vk::Queue queue; + vk::CommandPool pool; + uint32_t cmd_buffer_idx; + std::vector cmd_buffers; + + vk::PipelineStageFlags stage_flags; +}; + +struct vk_device { + vk::PhysicalDevice physical_device; + vk::PhysicalDeviceProperties properties; + std::string name; + uint64_t max_memory_allocation_size; + bool fp16; + vk::Device device; + uint32_t vendor_id; + vk_queue compute_queue; + vk_queue transfer_queue; + bool single_queue; + uint32_t descriptor_set_mode; + uint32_t subgroup_size; + bool uma; + + ~vk_device() { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "destroy device " << name << std::endl; +#endif + device.destroy(); + } +}; + +struct vk_buffer_struct { vk::Buffer buffer; vk::DeviceMemory device_memory; vk::MemoryPropertyFlags memory_property_flags; void * ptr; size_t size = 0; - uint32_t qf_owner; + + ggml_backend_vk_context * ctx; + + std::shared_ptr device; + + ~vk_buffer_struct() { + if (size == 0) { + return; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl; +#endif + + device->device.freeMemory(device_memory); + device->device.destroyBuffer(buffer); + } }; +typedef std::shared_ptr vk_buffer; +typedef std::weak_ptr vk_buffer_ref; + struct vk_subbuffer { vk_buffer buffer; uint64_t offset; @@ -70,6 +126,7 @@ struct vk_subbuffer { struct vk_pipeline { std::string name; + vk::ShaderModule shader_module; vk::DescriptorSetLayout dsl; std::vector descriptor_pools; std::vector descriptor_sets; @@ -82,16 +139,6 @@ struct vk_pipeline { uint32_t align; }; -struct vk_queue { - uint32_t queue_family_index; - vk::Queue queue; - vk::CommandPool pool; - uint32_t cmd_buffer_idx; - std::vector cmd_buffers; - - vk::PipelineStageFlags stage_flags; -}; - struct vk_semaphore { vk::Semaphore s; uint64_t value; @@ -105,20 +152,6 @@ struct vk_submission { typedef std::vector vk_sequence; -struct vk_device { - vk::PhysicalDevice physical_device; - vk::PhysicalDeviceProperties properties; - uint64_t max_memory_allocation_size; - bool fp16; - vk::Device device; - uint32_t vendor_id; - vk_queue compute_queue; - vk_queue transfer_queue; - uint32_t descriptor_set_mode; - uint32_t subgroup_size; - bool uma; -}; - struct vk_op_push_constants { uint32_t KX; uint32_t KY; @@ -190,13 +223,13 @@ struct ggml_tensor_extra_gpu { size_t ctx_idx; - vk_buffer buffer_gpu; + vk_buffer_ref buffer_gpu; uint64_t offset; void reset() { ready = false; ctx_idx = 0; - buffer_gpu.size = 0; + buffer_gpu.reset(); offset = 0; } }; @@ -210,69 +243,96 @@ struct ggml_vk_garbage_collector { std::vector contexts; }; -typedef void (*ggml_vk_func_t)(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +struct ggml_backend_vk_context { + std::string name; -vk::Instance vk_instance; -vk_device vk_device; -vk_pipeline vk_pipeline_matmul_f32_l, vk_pipeline_matmul_f32_m, vk_pipeline_matmul_f32_s; -vk_pipeline vk_pipeline_matmul_f32_aligned_l, vk_pipeline_matmul_f32_aligned_m, vk_pipeline_matmul_f32_aligned_s; -vk_pipeline vk_pipeline_matmul_f16_l, vk_pipeline_matmul_f16_m, vk_pipeline_matmul_f16_s; -vk_pipeline vk_pipeline_matmul_f16_aligned_l, vk_pipeline_matmul_f16_aligned_m, vk_pipeline_matmul_f16_aligned_s; -vk_pipeline vk_pipeline_matmul_f16_f32_l, vk_pipeline_matmul_f16_f32_m, vk_pipeline_matmul_f16_f32_s; -vk_pipeline vk_pipeline_matmul_f16_f32_aligned_l, vk_pipeline_matmul_f16_f32_aligned_m, vk_pipeline_matmul_f16_f32_aligned_s; -vk_pipeline vk_pipeline_matmul_split_k_reduce; -vk_pipeline vk_pipeline_dequant[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_mul_mat_vec_p021_f16_f32; -vk_pipeline vk_pipeline_mul_mat_vec_nc_f16_f32; -vk_pipeline vk_pipeline_get_rows[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_get_rows_f32[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_mul_f32; -vk_pipeline vk_pipeline_add_f32; -vk_pipeline vk_pipeline_scale_f32; -vk_pipeline vk_pipeline_sqr_f32; -vk_pipeline vk_pipeline_clamp_f32; -vk_pipeline vk_pipeline_cpy_f32_f32, vk_pipeline_cpy_f32_f16, vk_pipeline_cpy_f16_f16; -vk_pipeline vk_pipeline_norm_f32; -vk_pipeline vk_pipeline_rms_norm_f32; -vk_pipeline vk_pipeline_gelu_f32; -vk_pipeline vk_pipeline_silu_f32; -vk_pipeline vk_pipeline_relu_f32; -vk_pipeline vk_pipeline_diag_mask_inf_f32; -vk_pipeline vk_pipeline_soft_max_f32; -vk_pipeline vk_pipeline_rope_f32, vk_pipeline_rope_f16; -vk_pipeline vk_pipeline_rope_neox_f32, vk_pipeline_rope_neox_f16; + std::weak_ptr device; + vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s; + vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s; + vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s; + vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s; + vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s; + vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s; + vk_pipeline pipeline_matmul_split_k_reduce; + vk_pipeline pipeline_dequant[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; + vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; + vk_pipeline pipeline_get_rows[VK_NUM_TYPES]; + vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_mul_f32; + vk_pipeline pipeline_add_f32; + vk_pipeline pipeline_scale_f32; + vk_pipeline pipeline_sqr_f32; + vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; + vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_gelu_f32; + vk_pipeline pipeline_silu_f32; + vk_pipeline pipeline_relu_f32; + vk_pipeline pipeline_diag_mask_inf_f32; + vk_pipeline pipeline_soft_max_f32; + vk_pipeline pipeline_rope_f32, pipeline_rope_f16; + vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; -static size_t vk_semaphore_idx, vk_event_idx; -static ggml_vk_garbage_collector vk_gc; -static std::vector> vk_pinned_memory; -static size_t vk_prealloc_size_qx, vk_prealloc_size_qy, vk_prealloc_size_x, vk_prealloc_size_y, vk_prealloc_size_split_k; -static vk_buffer vk_prealloc_qx, vk_prealloc_qy, vk_prealloc_x, vk_prealloc_y, vk_prealloc_split_k; -static vk::Fence vk_fence; -static vk_buffer vk_staging; -static size_t vk_staging_size; -static size_t vk_staging_offset; -static vk_buffer vk_sync_staging; + size_t semaphore_idx, event_idx; + ggml_vk_garbage_collector gc; + std::vector> pinned_memory; + size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k; + vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k; + vk::Fence fence; + vk_buffer staging; + size_t staging_size; + size_t staging_offset; + vk_buffer sync_staging; -static vk_context * vk_ctx; -static vk_context * vk_transfer_ctx; + vk_buffer buffer_pool[MAX_VK_BUFFERS]; -static bool vk_disable; + vk_context * compute_ctx; + vk_context * transfer_ctx; + + bool disable; + bool initialized; + + size_t idx; +}; + +struct vk_instance { + vk::Instance instance; + + std::vector device_indices; + + std::shared_ptr devices[GGML_VK_MAX_DEVICES]; + ggml_backend_t backends[GGML_VK_MAX_DEVICES]; + ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES]; + ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES]; + bool initialized[GGML_VK_MAX_DEVICES]; +}; #ifdef GGML_VULKAN_CHECK_RESULTS -size_t vk_skip_checks; -size_t vk_output_tensor; +static size_t vk_skip_checks; +static size_t vk_output_tensor; + +static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name); +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor); +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor); #endif -static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { +typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +static bool vk_instance_initialized = false; +static vk_instance vk_instance; + +GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); + +static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl; #endif GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT - vk_pipeline pipeline; - pipeline.name = name; pipeline.parameter_count = parameter_count; pipeline.push_constant_size = push_constant_size; @@ -280,7 +340,7 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s pipeline.align = align; vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); - vk::ShaderModule shader_module = vk_device.device.createShaderModule(shader_module_create_info); + pipeline.shader_module = ctx->device.lock()->device.createShaderModule(shader_module_create_info); std::vector dsl_binding; std::vector dsl_binding_flags; @@ -301,17 +361,17 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s {}, dsl_binding); descriptor_set_layout_create_info.setPNext(&dslbfci); - pipeline.dsl = vk_device.device.createDescriptorSetLayout(descriptor_set_layout_create_info); + pipeline.dsl = ctx->device.lock()->device.createDescriptorSetLayout(descriptor_set_layout_create_info); // Check if device supports multiple descriptors per pool - if (vk_device.descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { + if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { const uint32_t alloc_count = 2; // Try allocating multiple sets from one pool // This fails on AMD for some reason, so add a fall back to allocating one pool per set vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size); - vk::DescriptorPool pool = vk_device.device.createDescriptorPool(descriptor_pool_create_info); + vk::DescriptorPool pool = ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info); std::vector layouts(alloc_count); for (uint32_t i = 0; i < alloc_count; i++) { @@ -319,24 +379,24 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s } try { vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data()); - std::vector sets = vk_device.device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); } catch(vk::OutOfPoolMemoryError const&) { - vk_device.descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; + ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; } - vk_device.device.destroyDescriptorPool(pool); + ctx->device.lock()->device.destroyDescriptorPool(pool); } - if (vk_device.descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { + if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size); - pipeline.descriptor_pools.push_back(vk_device.device.createDescriptorPool(descriptor_pool_create_info)); + pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info)); } pipeline.descriptor_set_idx = 0; vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr); - pipeline.layout = vk_device.device.createPipelineLayout(pipeline_layout_create_info); + pipeline.layout = ctx->device.lock()->device.createPipelineLayout(pipeline_layout_create_info); std::vector specialization_entries(specialization_constants.size()); @@ -356,41 +416,45 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( vk::PipelineShaderStageCreateFlags(), vk::ShaderStageFlagBits::eCompute, - shader_module, + pipeline.shader_module, entrypoint.c_str(), &specialization_info); vk::ComputePipelineCreateInfo compute_pipeline_create_info( vk::PipelineCreateFlags(), pipeline_shader_create_info, pipeline.layout); - pipeline.pipeline = vk_device.device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; + pipeline.pipeline = ctx->device.lock()->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - return pipeline; + ctx->gc.pipelines.push_back(&pipeline); } -static void ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline& pipeline, uint32_t n) { +static void ggml_vk_destroy_pipeline(ggml_backend_vk_context * ctx, vk_pipeline * pipeline) { + for (auto& pool : pipeline->descriptor_pools) { + ctx->device.lock()->device.destroyDescriptorPool(pool); + } + pipeline->descriptor_pools.clear(); + pipeline->descriptor_sets.clear(); + pipeline->descriptor_set_idx = 0; + + ctx->device.lock()->device.destroyDescriptorSetLayout(pipeline->dsl); + + ctx->device.lock()->device.destroyPipelineLayout(pipeline->layout); + + ctx->device.lock()->device.destroyShaderModule(pipeline->shader_module); + + ctx->device.lock()->device.destroyPipeline(pipeline->pipeline); +} + +static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl; + std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl; #endif - // Check if gc already contains pipeline before adding it - bool gc_found = false; - for (auto * pl : vk_gc.pipelines) { - if (&pipeline == pl) { - gc_found = true; - break; - } - } - - if (!gc_found) { - vk_gc.pipelines.push_back(&pipeline); - } - if (pipeline.descriptor_sets.size() >= pipeline.descriptor_set_idx + n) { // Enough descriptors are available return; } - if (vk_device.descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { + if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { const uint32_t alloc_count = pipeline.descriptor_set_idx + n - pipeline.descriptor_sets.size(); std::vector layouts(alloc_count); @@ -398,29 +462,29 @@ static void ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline& pipeline, uin layouts[i] = pipeline.dsl; } vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[0], alloc_count, layouts.data()); - std::vector sets = vk_device.device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); pipeline.descriptor_sets.insert(pipeline.descriptor_sets.end(), sets.begin(), sets.end()); } else { for (uint32_t i = pipeline.descriptor_sets.size(); i < pipeline.descriptor_set_idx + n; i++) { vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size); - pipeline.descriptor_pools.push_back(vk_device.device.createDescriptorPool(descriptor_pool_create_info)); + pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info)); vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[i], 1, &pipeline.dsl); - std::vector sets = vk_device.device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); pipeline.descriptor_sets.push_back(sets[0]); } } } -static void ggml_vk_pipeline_cleanup(vk_pipeline& pipeline) { +static void ggml_pipeline_cleanup(vk_pipeline& pipeline) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_pipeline_cleanup(" << pipeline.name << ")" << std::endl; + std::cerr << "ggml_pipeline_cleanup(" << pipeline.name << ")" << std::endl; #endif pipeline.descriptor_set_idx = 0; } -static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { +static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl; #endif @@ -433,7 +497,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { q.pool, vk::CommandBufferLevel::ePrimary, 1); - const std::vector cmd_buffers = vk_device.device.allocateCommandBuffers(command_buffer_alloc_info); + const std::vector cmd_buffers = ctx->device.lock()->device.allocateCommandBuffers(command_buffer_alloc_info); auto buf = cmd_buffers.front(); q.cmd_buffers.push_back(buf); @@ -442,24 +506,17 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { return buf; } -static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { +static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_submission()" << std::endl; #endif vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(q); + s.buffer = ggml_vk_create_cmd_buffer(ctx, q); s.wait_semaphores = std::move(wait_semaphores); s.signal_semaphores = std::move(signal_semaphores); return s; } -static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_create_sequence_1()" << std::endl; -#endif - return { ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores)) }; -} - static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl; @@ -578,89 +635,89 @@ static uint32_t ggml_vk_find_queue_family_index(std::vectordevice.lock()->device.createCommandPool(command_pool_create_info_compute); q.cmd_buffer_idx = 0; - q.queue = vk_device.device.getQueue(queue_family_index, queue_index); + q.queue = ctx->device.lock()->device.getQueue(queue_family_index, queue_index); q.stage_flags = stage_flags; - - return q; } -static vk_context * ggml_vk_create_context(vk_queue& q) { +static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_context()" << std::endl; #endif - vk_gc.contexts.emplace_back(); - vk_context * result = &vk_gc.contexts[vk_gc.contexts.size() - 1]; + ctx->gc.contexts.emplace_back(); + vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1]; memset((void *) result, 0, sizeof(vk_context)); - result->idx = vk_gc.contexts.size() - 1; + result->idx = ctx->gc.contexts.size() - 1; result->q = &q; return result; } -static vk_semaphore * ggml_vk_create_binary_semaphore() { +static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl; #endif vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); - vk::Semaphore semaphore = vk_device.device.createSemaphore(ci); - vk_gc.semaphores.push_back({ semaphore, 0 }); - return &vk_gc.semaphores[vk_gc.semaphores.size() - 1]; + vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci); + ctx->gc.semaphores.push_back({ semaphore, 0 }); + return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1]; } -static vk_semaphore * ggml_vk_create_timeline_semaphore() { +static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl; #endif - if (vk_semaphore_idx >= vk_gc.tl_semaphores.size()) { + if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) { vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); - vk::Semaphore semaphore = vk_device.device.createSemaphore(ci); - vk_gc.tl_semaphores.push_back({ semaphore, 0 }); + vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci); + ctx->gc.tl_semaphores.push_back({ semaphore, 0 }); } - return &vk_gc.tl_semaphores[vk_semaphore_idx++]; + return &ctx->gc.tl_semaphores[ctx->semaphore_idx++]; } -static vk::Event ggml_vk_create_event() { - if (vk_event_idx >= vk_gc.events.size()) { - vk_gc.events.push_back(vk_device.device.createEvent({})); +static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { + if (ctx->event_idx >= ctx->gc.events.size()) { + ctx->gc.events.push_back(ctx->device.lock()->device.createEvent({})); } - return vk_gc.events[vk_event_idx++]; + return ctx->gc.events[ctx->event_idx++]; } -static void ggml_vk_queue_cleanup(vk_queue& q) { +static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_queue_cleanup()" << std::endl; #endif // Requires command buffers to be done - vk_device.device.resetCommandPool(q.pool); + ctx->device.lock()->device.resetCommandPool(q.pool); q.cmd_buffer_idx = 0; } -static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_flags) { +static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl; #endif - GGML_ASSERT(size > 0); + vk_buffer buf = std::make_shared(); - vk_buffer buf; + if (size == 0) { + buf->size = 0; + return buf; + } - buf.size = size; + buf->size = size; vk::BufferCreateInfo buffer_create_info{ vk::BufferCreateFlags(), size, @@ -670,11 +727,11 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ nullptr, }; - buf.buffer = vk_device.device.createBuffer(buffer_create_info); + buf->buffer = ctx->device.lock()->device.createBuffer(buffer_create_info); - vk::MemoryRequirements mem_req = vk_device.device.getBufferMemoryRequirements(buf.buffer); + vk::MemoryRequirements mem_req = ctx->device.lock()->device.getBufferMemoryRequirements(buf->buffer); - vk::PhysicalDeviceMemoryProperties mem_props = vk_device.physical_device.getMemoryProperties(); + vk::PhysicalDeviceMemoryProperties mem_props = ctx->device.lock()->physical_device.getMemoryProperties(); uint32_t memory_type_index = UINT32_MAX; @@ -691,30 +748,36 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ } try { - buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index }); + buf->device_memory = ctx->device.lock()->device.allocateMemory({ mem_req.size, memory_type_index }); } catch (const vk::SystemError& e) { // Out of Host/Device memory, clean up buffer - vk_device.device.destroyBuffer(buf.buffer); - buf.size = 0; + ctx->device.lock()->device.destroyBuffer(buf->buffer); + buf->size = 0; throw e; } - buf.memory_property_flags = req_flags; - buf.ptr = nullptr; + buf->memory_property_flags = req_flags; + buf->ptr = nullptr; if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - buf.ptr = vk_device.device.mapMemory(buf.device_memory, 0, VK_WHOLE_SIZE); + buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); } - vk_device.device.bindBufferMemory(buf.buffer, buf.device_memory, 0); + ctx->device.lock()->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); - buf.qf_owner = VK_QUEUE_FAMILY_IGNORED; + buf->ctx = ctx; + + buf->device = ctx->device.lock(); + +#ifdef GGML_VULKAN_DEBUG + std::cerr << "Created buffer " << buf->buffer << std::endl; +#endif return buf; } -static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlags req_flags) { +static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { try { - return ggml_vk_create_buffer(size, req_flags); + return ggml_vk_create_buffer(ctx, size, req_flags); } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -722,14 +785,14 @@ static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlag } } -static vk_buffer ggml_vk_create_buffer_device(size_t size) { +static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) { vk_buffer buf; try { - buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal); + buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } catch (const vk::SystemError& e) { - if (vk_device.uma) { + if (ctx->device.lock()->uma) { // Fall back to host memory type - buf = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } else { std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -741,16 +804,7 @@ static vk_buffer ggml_vk_create_buffer_device(size_t size) { } static void ggml_vk_destroy_buffer(vk_buffer& buf) { - if (buf.size == 0) { - return; - } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl; -#endif - - buf.size = 0; - vk_device.device.freeMemory(buf.device_memory); - vk_device.device.destroyBuffer(buf.buffer); + buf.reset(); } static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { @@ -773,7 +827,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) { ); } -static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector&& events, vk::PipelineStageFlags src_stages, vk::PipelineStageFlags dst_stages) { +static void ggml_vk_wait_events(vk_context * ctx, std::vector&& events) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_wait_events()" << std::endl; #endif @@ -781,10 +835,10 @@ static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vectors->buffer.waitEvents( events, - src_stages, - dst_stages, + ctx->q->stage_flags, + ctx->q->stage_flags, {}, {}, {} @@ -810,15 +864,15 @@ static bool ggml_vk_build_shader(ggml_type type) { } } -static void ggml_vk_load_shaders() { +static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_load_shaders()" << std::endl; + std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl; #endif // mulmat - std::initializer_list warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size }; - std::initializer_list warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size }; - std::initializer_list warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size }; + std::initializer_list warptile_l = { 128, 128, 128, 16, ctx->device.lock()->subgroup_size * 2, 64, 2, 4, 4, ctx->device.lock()->subgroup_size }; + std::initializer_list warptile_m = { 128, 64, 64, 16, ctx->device.lock()->subgroup_size, 32, 2, 4, 2, ctx->device.lock()->subgroup_size }; + std::initializer_list warptile_s = { ctx->device.lock()->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, ctx->device.lock()->subgroup_size }; std::array l_wg_denoms = {128, 128, 1 }; std::array m_wg_denoms = { 64, 64, 1 }; @@ -828,145 +882,208 @@ static void ggml_vk_load_shaders() { uint32_t m_align = 64; uint32_t s_align = 32; - if (vk_device.fp16) { - vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_len, matmul_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_len, matmul_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline("matmul_f32_s", matmul_f32_s_len, matmul_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline("matmul_f32_aligned_l", matmul_f32_aligned_l_len, matmul_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline("matmul_f32_aligned_m", matmul_f32_aligned_m_len, matmul_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline("matmul_f32_aligned_s", matmul_f32_aligned_s_len, matmul_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + if (ctx->device.lock()->fp16) { + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_l_len, matmul_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_m_len, matmul_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_s_len, matmul_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_l_len, matmul_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_m_len, matmul_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_s_len, matmul_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline("matmul_f16_l", matmul_f16_l_len, matmul_f16_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline("matmul_f16_m", matmul_f16_m_len, matmul_f16_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline("matmul_f16_s", matmul_f16_s_len, matmul_f16_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_l_len, matmul_f16_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_m_len, matmul_f16_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_s_len, matmul_f16_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_l_len, matmul_f16_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_m_len, matmul_f16_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_s_len, matmul_f16_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline("matmul_f16_aligned_l", matmul_f16_aligned_l_len, matmul_f16_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline("matmul_f16_aligned_m", matmul_f16_aligned_m_len, matmul_f16_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline("matmul_f16_aligned_s", matmul_f16_aligned_s_len, matmul_f16_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - - vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline("matmul_f16_f32_l", matmul_f16_f32_l_len, matmul_f16_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline("matmul_f16_f32_m", matmul_f16_f32_m_len, matmul_f16_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline("matmul_f16_f32_s", matmul_f16_f32_s_len, matmul_f16_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_l_len, matmul_f16_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_m_len, matmul_f16_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_s_len, matmul_f16_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); } else { - vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline("matmul_f32_s", matmul_f32_s_fp32_len, matmul_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline("matmul_f32_aligned_l", matmul_f32_aligned_l_fp32_len, matmul_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline("matmul_f32_aligned_m", matmul_f32_aligned_m_fp32_len, matmul_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline("matmul_f32_aligned_s", matmul_f32_aligned_s_fp32_len, matmul_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_s_fp32_len, matmul_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_l_fp32_len, matmul_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_m_fp32_len, matmul_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_s_fp32_len, matmul_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline("matmul_f16_l", matmul_f16_l_fp32_len, matmul_f16_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline("matmul_f16_m", matmul_f16_m_fp32_len, matmul_f16_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline("matmul_f16_s", matmul_f16_s_fp32_len, matmul_f16_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_l_fp32_len, matmul_f16_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_m_fp32_len, matmul_f16_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_s_fp32_len, matmul_f16_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_l_fp32_len, matmul_f16_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_m_fp32_len, matmul_f16_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_s_fp32_len, matmul_f16_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline("matmul_f16_aligned_l", matmul_f16_aligned_l_fp32_len, matmul_f16_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline("matmul_f16_aligned_m", matmul_f16_aligned_m_fp32_len, matmul_f16_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline("matmul_f16_aligned_s", matmul_f16_aligned_s_fp32_len, matmul_f16_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - - vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline("matmul_f16_f32_l", matmul_f16_f32_l_fp32_len, matmul_f16_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline("matmul_f16_f32_m", matmul_f16_f32_m_fp32_len, matmul_f16_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline("matmul_f16_f32_s", matmul_f16_f32_s_fp32_len, matmul_f16_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_l_fp32_len, matmul_f16_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_m_fp32_len, matmul_f16_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_s_fp32_len, matmul_f16_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); } - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); // dequant shaders - vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1); - - vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), { 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); // get_rows - vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); - vk_pipeline_mul_mat_vec_p021_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); - vk_pipeline_mul_mat_vec_nc_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); - vk_pipeline_norm_f32 = ggml_vk_create_pipeline("norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - vk_pipeline_rms_norm_f32 = ggml_vk_create_pipeline("rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - vk_pipeline_cpy_f32_f32 = ggml_vk_create_pipeline("cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_cpy_f32_f16 = ggml_vk_create_pipeline("cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_cpy_f16_f16 = ggml_vk_create_pipeline("cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_add_f32 = ggml_vk_create_pipeline("add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_mul_f32 = ggml_vk_create_pipeline("mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_scale_f32 = ggml_vk_create_pipeline("scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_sqr_f32 = ggml_vk_create_pipeline("sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_clamp_f32 = ggml_vk_create_pipeline("clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_gelu_f32 = ggml_vk_create_pipeline("gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_silu_f32 = ggml_vk_create_pipeline("silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_relu_f32 = ggml_vk_create_pipeline("relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_diag_mask_inf_f32 = ggml_vk_create_pipeline("diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_soft_max_f32 = ggml_vk_create_pipeline("soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - vk_pipeline_rope_f32 = ggml_vk_create_pipeline("rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - vk_pipeline_rope_f16 = ggml_vk_create_pipeline("rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - vk_pipeline_rope_neox_f32 = ggml_vk_create_pipeline("rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); - vk_pipeline_rope_neox_f16 = ggml_vk_create_pipeline("rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); } -void ggml_vk_init() { +static void ggml_vk_print_gpu_info(size_t idx) { + GGML_ASSERT(idx < vk_instance.device_indices.size()); + size_t dev_num = vk_instance.device_indices[idx]; #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_init()" << std::endl; + std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl; #endif - static bool initialized = false; + GGML_ASSERT(vk_instance.initialized); - if (initialized) { - return; + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + if (dev_num >= devices.size()) { + std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; + throw std::runtime_error("Device not found"); } - initialized = true; + vk::PhysicalDevice physical_device = devices[dev_num]; + std::vector ext_props = physical_device.enumerateDeviceExtensionProperties(); - const char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE"); - int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE)); + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceMaintenance3Properties props3; + vk::PhysicalDeviceSubgroupProperties subgroup_props; + props2.pNext = &props3; + props3.pNext = &subgroup_props; + physical_device.getProperties2(&props2); + + const size_t subgroup_size = subgroup_props.subgroupSize; + const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + + bool fp16_storage = false; + bool fp16_compute = false; + + for (auto properties : ext_props) { + if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; + } + } + + const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); + bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr; + + bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + + vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures(); + + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + device_features2.pNext = nullptr; + device_features2.features = (VkPhysicalDeviceFeatures)device_features; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + VkPhysicalDeviceVulkan12Features vk12_features; + vk12_features.pNext = nullptr; + vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + vk11_features.pNext = &vk12_features; + + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); + + fp16 = fp16 && vk12_features.shaderFloat16; + + std::string device_name = props2.properties.deviceName.data(); + std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl; + + if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { + std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl; + } +} + +void ggml_vk_instance_init() { + if (vk_instance_initialized) { + return; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_instance_init()" << std::endl; +#endif vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; const std::vector layers = { @@ -989,12 +1106,55 @@ void ggml_vk_init() { validation_features.setPNext(nullptr); instance_create_info.setPNext(&validation_features); -std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; + std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; #endif - vk_instance = vk::createInstance(instance_create_info); + vk_instance.instance = vk::createInstance(instance_create_info); - vk_device.physical_device = vk_instance.enumeratePhysicalDevices()[dev_num]; - std::vector ext_props = vk_device.physical_device.enumerateDeviceExtensionProperties(); + memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES); + + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); + + // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan + char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); + if (devices_env != nullptr) { + std::string devices(devices_env); + std::replace(devices.begin(), devices.end(), ',', ' '); + + std::stringstream ss(devices); + size_t tmp; + while (ss >> tmp) { + if(tmp >= num_available_devices) { + std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl; + throw std::runtime_error("Invalid Vulkan device index"); + } + vk_instance.device_indices.push_back(tmp); + } + } else { + vk_instance.device_indices.push_back(0); + } + + vk_instance_initialized = true; +} + +void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { + GGML_ASSERT(idx < vk_instance.device_indices.size()); + size_t dev_num = vk_instance.device_indices[idx]; +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl; +#endif + ggml_vk_instance_init(); + + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + if (dev_num >= devices.size()) { + std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; + throw std::runtime_error("Device not found"); + } + + vk_instance.devices[idx] = std::make_shared(); + ctx->device = vk_instance.devices[idx]; + ctx->device.lock()->physical_device = devices[dev_num]; + std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; @@ -1014,18 +1174,18 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; if (maintenance4_support) { subgroup_props.pNext = &props4; } - vk_device.physical_device.getProperties2(&props2); - vk_device.properties = props2.properties; + ctx->device.lock()->physical_device.getProperties2(&props2); + ctx->device.lock()->properties = props2.properties; if (maintenance4_support) { - vk_device.max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); + ctx->device.lock()->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); } else { - vk_device.max_memory_allocation_size = props3.maxMemoryAllocationSize; + ctx->device.lock()->max_memory_allocation_size = props3.maxMemoryAllocationSize; } - vk_device.vendor_id = vk_device.properties.vendorID; - vk_device.subgroup_size = subgroup_props.subgroupSize; - vk_device.uma = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + ctx->device.lock()->vendor_id = ctx->device.lock()->properties.vendorID; + ctx->device.lock()->subgroup_size = subgroup_props.subgroupSize; + ctx->device.lock()->uma = ctx->device.lock()->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; bool fp16_storage = false; bool fp16_compute = false; @@ -1039,31 +1199,31 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; } const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); - bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != NULL; + bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr; - vk_device.fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + ctx->device.lock()->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - std::vector queue_family_props = vk_device.physical_device.getQueueFamilyProperties(); + std::vector queue_family_props = ctx->device.lock()->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1); const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1); const float priorities[] = { 1.0f, 1.0f }; - const bool single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; + ctx->device.lock()->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; std::vector device_queue_create_infos; if (compute_queue_family_index != transfer_queue_family_index) { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); - } else if(!single_queue) { + } else if(!ctx->device.lock()->single_queue) { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); } else { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); } vk::DeviceCreateInfo device_create_info; std::vector device_extensions; - vk::PhysicalDeviceFeatures device_features = vk_device.physical_device.getFeatures(); + vk::PhysicalDeviceFeatures device_features = ctx->device.lock()->physical_device.getFeatures(); VkPhysicalDeviceFeatures2 device_features2; device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; @@ -1080,13 +1240,13 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; - vkGetPhysicalDeviceFeatures2(vk_device.physical_device, &device_features2); + vkGetPhysicalDeviceFeatures2(ctx->device.lock()->physical_device, &device_features2); - vk_device.fp16 = vk_device.fp16 && vk12_features.shaderFloat16; + ctx->device.lock()->fp16 = ctx->device.lock()->fp16 && vk12_features.shaderFloat16; if (!vk11_features.storageBuffer16BitAccess) { - std::cerr << "ggml_vulkan: device does not support 16-bit storage" << std::endl; - GGML_ASSERT(false); + std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; + throw std::runtime_error("Unsupported device"); } device_extensions.push_back("VK_KHR_16bit_storage"); @@ -1095,10 +1255,11 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; device_extensions.push_back("VK_KHR_shader_non_semantic_info"); #endif - if (vk_device.fp16) { + if (ctx->device.lock()->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } - std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | uma: " << vk_device.uma << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl; + ctx->device.lock()->name = ctx->device.lock()->properties.deviceName.data(); + device_create_info = { vk::DeviceCreateFlags(), device_queue_create_infos, @@ -1106,28 +1267,32 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; device_extensions }; device_create_info.setPNext(&device_features2); - vk_device.device = vk_device.physical_device.createDevice(device_create_info); + ctx->device.lock()->device = ctx->device.lock()->physical_device.createDevice(device_create_info); - vk_device.descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; + ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; // Shaders - ggml_vk_load_shaders(); + ggml_vk_load_shaders(ctx); // Queues - vk_device.compute_queue = ggml_vk_create_queue(compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); - if (!single_queue) { + ggml_vk_create_queue(ctx, ctx->device.lock()->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); + if (!ctx->device.lock()->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; - vk_device.transfer_queue = ggml_vk_create_queue(transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); + ggml_vk_create_queue(ctx, ctx->device.lock()->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); } else { - vk_device.transfer_queue = vk_device.compute_queue; + // TODO: Use pointer or reference to avoid copy + ctx->device.lock()->transfer_queue = ctx->device.lock()->compute_queue; } - vk_fence = vk_device.device.createFence({}); + ctx->fence = ctx->device.lock()->device.createFence({}); - vk_ctx = nullptr; - vk_transfer_ctx = nullptr; + ctx->compute_ctx = nullptr; + ctx->transfer_ctx = nullptr; - vk_disable = false; + ctx->disable = false; + ctx->initialized = true; + + ctx->idx = idx; #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); @@ -1137,7 +1302,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; #endif } -static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { +static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_to_fp16()" << std::endl; #endif @@ -1158,10 +1323,10 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { return nullptr; } - return &vk_pipeline_dequant[type]; + return &ctx->pipeline_dequant[type]; } -static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { +static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl; #endif @@ -1182,15 +1347,10 @@ static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { return nullptr; } - return &vk_pipeline_dequant_mul_mat_vec_f32[type]; + return &ctx->pipeline_dequant_mul_mat_vec_f32[type]; } -// buffer pool for vulkan -#define MAX_VK_BUFFERS 256 - -static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS]; - -static vk_buffer ggml_vk_pool_malloc(size_t size) { +static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl; #endif @@ -1199,98 +1359,95 @@ static vk_buffer ggml_vk_pool_malloc(size_t size) { int worst_i = -1; size_t worst_size = 0; //largest unused buffer seen so far for (int i = 0; i < MAX_VK_BUFFERS; ++i) { - vk_buffer &b = g_vk_buffer_pool[i]; - if (b.size > 0 && b.size >= size && b.size < best_size) { + vk_buffer &b = ctx->buffer_pool[i]; + if (b != nullptr && b->size >= size && b->size < best_size) { best_i = i; - best_size = b.size; + best_size = b->size; } - if (b.size > 0 && b.size > worst_size) { + if (b != nullptr && b->size > worst_size) { worst_i = i; - worst_size = b.size; + worst_size = b->size; } } if(best_i != -1) { //found the smallest buffer that fits our needs - vk_buffer b = g_vk_buffer_pool[best_i]; - g_vk_buffer_pool[best_i].size = 0; + vk_buffer b = ctx->buffer_pool[best_i]; + ctx->buffer_pool[best_i].reset(); return b; } if(worst_i != -1) { //no buffer that fits our needs, resize largest one to save memory - vk_buffer& b = g_vk_buffer_pool[worst_i]; + vk_buffer& b = ctx->buffer_pool[worst_i]; ggml_vk_destroy_buffer(b); } - return ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eDeviceLocal); + return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } -static void ggml_vk_pool_free(vk_buffer& buffer) { +static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_pool_free(" << buffer.size << ")" << std::endl; + std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl; #endif for (int i = 0; i < MAX_VK_BUFFERS; ++i) { - vk_buffer& b = g_vk_buffer_pool[i]; - if (b.size == 0) { + vk_buffer& b = ctx->buffer_pool[i]; + if (b == nullptr) { b = buffer; - // Set owning queue family index to ignored to avoid synchronization on next use - b.qf_owner = VK_QUEUE_FAMILY_IGNORED; return; } } - fprintf(stderr, "WARNING: vk buffer pool full, increase MAX_VK_BUFFERS\n"); + std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl; ggml_vk_destroy_buffer(buffer); } // Returns an available temporary buffer that may only be used temporarily, it will be reused -static vk_buffer ggml_vk_create_buffer_temp(size_t size) { +static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) { // Try to find existing temp buffer with enough capacity - for (auto& buffer : vk_gc.temp_buffers) { - if (buffer.size >= size) { + for (auto& buffer : ctx->gc.temp_buffers) { + if (buffer->size >= size) { return buffer; } } // Otherwise create new buffer - vk_buffer buf = ggml_vk_pool_malloc(size); - vk_gc.temp_buffers.push_back(buf); + vk_buffer buf = ggml_vk_pool_malloc(ctx, size); + ctx->gc.temp_buffers.push_back(buf); return buf; } -static void * ggml_vk_host_malloc(size_t size) { +static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl; #endif - vk_buffer buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - if(!(buf.memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { + if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", size/1024.0/1024.0); - buf.size = 0; - vk_device.device.freeMemory(buf.device_memory); - vk_device.device.destroyBuffer(buf.buffer); + ctx->device.lock()->device.freeMemory(buf->device_memory); + ctx->device.lock()->device.destroyBuffer(buf->buffer); return nullptr; } - vk_pinned_memory.push_back(std::make_tuple(buf.ptr, size, buf)); + ctx->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); - return buf.ptr; + return buf->ptr; } -static void ggml_vk_host_free(void* ptr) { +static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) { if (ptr == nullptr) { return; } #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl; #endif - vk_buffer* buf = nullptr; + vk_buffer buf; size_t index; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); + for (size_t i = 0; i < ctx->pinned_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]); if (ptr >= addr && ptr < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); + buf = std::get<2>(ctx->pinned_memory[i]); index = i; break; } @@ -1300,28 +1457,28 @@ static void ggml_vk_host_free(void* ptr) { return; } - ggml_vk_destroy_buffer(*buf); + ggml_vk_destroy_buffer(buf); - vk_pinned_memory.erase(vk_pinned_memory.begin() + index); + ctx->pinned_memory.erase(ctx->pinned_memory.begin() + index); } -static void ggml_vk_host_get(const void * ptr, vk_buffer *& buf, size_t& buf_offset) { +static void ggml_vk_host_get(ggml_backend_vk_context * ctx, const void * ptr, vk_buffer& buf, size_t& buf_offset) { buf = nullptr; buf_offset = 0; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); + for (size_t i = 0; i < ctx->pinned_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]); if (ptr >= addr && ptr < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); + buf = std::get<2>(ctx->pinned_memory[i]); buf_offset = ((const uint8_t *)ptr) - addr; break; } } } -static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) { +static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_queue& q, bool one_time = true) { vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(q); + s.buffer = ggml_vk_create_cmd_buffer(ctx, q); if (one_time) { s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); } else { @@ -1331,7 +1488,7 @@ static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) return s; } -static void ggml_vk_dispatch_pipeline(vk_context * ctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]); @@ -1344,22 +1501,22 @@ static void ggml_vk_dispatch_pipeline(vk_context * ctx, vk_pipeline& pipeline, s GGML_ASSERT(buffers.size() == pipeline.parameter_count); vk::DescriptorSet& descriptor_set = pipeline.descriptor_sets[pipeline.descriptor_set_idx++]; for (uint32_t i = 0; i < pipeline.parameter_count; i++) { - descriptor_buffer_infos.push_back({buffers[i].buffer.buffer, buffers[i].offset, buffers[i].size}); + descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); } for (uint32_t i = 0; i < pipeline.parameter_count; i++) { write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); } - vk_device.device.updateDescriptorSets(write_descriptor_sets, {}); + ctx->device.lock()->device.updateDescriptorSets(write_descriptor_sets, {}); - ctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); - ctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline); - ctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, + subctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); + subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline); + subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline.layout, 0, { descriptor_set }, {}); - ctx->s->buffer.dispatch(wg0, wg1, wg2); + subctx->s->buffer.dispatch(wg0, wg1, wg2); } static void ggml_vk_end_submission(vk_submission& s, std::vector wait_semaphores, std::vector signal_semaphores) { @@ -1381,16 +1538,16 @@ static void ggml_vk_ctx_end(vk_context * ctx) { ctx->s = nullptr; } -static void ggml_vk_ctx_begin(vk_context * ctx) { +static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl; #endif - if (ctx->s != nullptr) { - ggml_vk_ctx_end(ctx); + if (subctx->s != nullptr) { + ggml_vk_ctx_end(subctx); } - ctx->seqs.push_back({ ggml_vk_begin_submission(*ctx->q) }); - ctx->s = ctx->seqs[ctx->seqs.size() - 1].data(); + subctx->seqs.push_back({ ggml_vk_begin_submission(ctx, *subctx->q) }); + subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); } static size_t ggml_vk_align_size(size_t width, size_t align) { @@ -1405,14 +1562,14 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect } } -static void ensure_sync_staging_buffer(size_t size) { - if (vk_sync_staging.size < size) { - ggml_vk_destroy_buffer(vk_sync_staging); - vk_sync_staging = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); +static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { + if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { + ggml_vk_destroy_buffer(ctx->sync_staging); + ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); } } -static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { +static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl; #endif @@ -1423,9 +1580,9 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size GGML_ASSERT(false); } // Check if src is pinned memory - vk_buffer * buf = nullptr; + vk_buffer buf; size_t buf_offset; - ggml_vk_host_get(tensor->data, buf, buf_offset); + ggml_vk_host_get(ctx, tensor->data, buf, buf_offset); const uint64_t ne0 = tensor->ne[0]; const uint64_t ne1 = tensor->ne[1]; @@ -1471,21 +1628,21 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } // Staging buffer required - vk_buffer * staging = &vk_staging; - size_t staging_offset = vk_staging_offset; + vk_buffer staging = ctx->staging; + size_t staging_offset = ctx->staging_offset; const size_t copy_size = ts*ne/bs; - if (vk_staging.size < vk_staging_offset + copy_size) { + if (ctx->staging->size < ctx->staging_offset + copy_size) { if (sync_staging) { // Create temporary larger buffer - ensure_sync_staging_buffer(copy_size); + ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); - staging = &vk_sync_staging; + staging = ctx->sync_staging; staging_offset = 0; } else { GGML_ASSERT(false); @@ -1494,23 +1651,23 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size VkBufferCopy buf_copy{ staging_offset, offset, copy_size }; - ggml_vk_sync_buffers(ctx); - vkCmdCopyBuffer(ctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); + ggml_vk_sync_buffers(subctx); + vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); for (uint64_t i3 = 0; i3 < ne3; i3++) { for (uint64_t i2 = 0; i2 < ne2; i2++) { // Find longest contiguous slice if (ne1*nb1 == dstnb2) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); } else { for (uint64_t i1 = 0; i1 < ne1; i1++) { if (ne0*nb0/bs == dstnb1) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); } else { const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; for (uint64_t i0 = 0; i0 < ne0; i0++) { - deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); } } } @@ -1519,19 +1676,22 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size } } -static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { +static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl; #endif + // Make sure ctx owns the buffer + GGML_ASSERT(dst->ctx == ctx); + // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl; GGML_ASSERT(false); } // Check if src is pinned memory - vk_buffer * buf = nullptr; + vk_buffer buf = nullptr; size_t buf_offset; - ggml_vk_host_get(src, buf, buf_offset); + ggml_vk_host_get(ctx, src, buf, buf_offset); if (buf != nullptr) { // Memory is pinned, use as staging buffer @@ -1550,8 +1710,8 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } #ifdef GGML_VULKAN_DEBUG @@ -1559,14 +1719,14 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size #endif // Staging buffer required - vk_buffer * staging = &vk_staging; - size_t staging_offset = vk_staging_offset; + vk_buffer staging = ctx->staging; + size_t staging_offset = ctx->staging_offset; const size_t copy_size = width*height; - if (vk_staging.size < vk_staging_offset + copy_size) { + if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) { if (sync_staging) { - ensure_sync_staging_buffer(copy_size); + ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); - staging = &vk_sync_staging; + staging = ctx->sync_staging; staging_offset = 0; } else { GGML_ASSERT(false); @@ -1578,26 +1738,26 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size offset, copy_size}; - ggml_vk_sync_buffers(ctx); - vkCmdCopyBuffer(ctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); + ggml_vk_sync_buffers(subctx); + vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); if (width == spitch) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &subctx->in_memcpys); } else { for (size_t i = 0; i < height; i++) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); } } } -static void ggml_vk_buffer_write_async(vk_context * ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { +static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl; #endif - return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, sync_staging); + return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging); } -static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { +static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl; #endif @@ -1609,39 +1769,42 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, spitch, width, height, true); - ggml_vk_ctx_end(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true); + ggml_vk_ctx_end(subctx); - for (auto& cpy : ctx->in_memcpys) { + for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); } } -static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) { +static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl; #endif - ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1); + ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1); } -static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { +static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl; #endif GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); - GGML_ASSERT(src->size > 0); + GGML_ASSERT(src != nullptr); + // Make sure ctx owns the buffer + GGML_ASSERT(src->ctx == ctx); + // Check if dst is pinned memory - vk_buffer * buf = nullptr; + vk_buffer buf = nullptr; size_t buf_offset; - ggml_vk_host_get(dst, buf, buf_offset); + ggml_vk_host_get(ctx, dst, buf, buf_offset); std::vector slices(1); if (width == spitch && width == dpitch) { @@ -1660,8 +1823,8 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ if (buf != nullptr) { // Memory is pinned, use as staging buffer - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices); return; } @@ -1670,30 +1833,30 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ #endif // Fall back to staging buffer - vk_buffer * staging = &vk_staging; + vk_buffer staging = ctx->staging; const size_t copy_size = dpitch * height; - if (vk_staging.size < vk_staging_offset + copy_size) { + if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) { if (sync_staging) { // Create temporary larger buffer - ensure_sync_staging_buffer(copy_size); + ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); - staging = &vk_sync_staging; + staging = ctx->sync_staging; } else { GGML_ASSERT(false); } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices); - deferred_memcpy(dst, staging->ptr, copy_size, &ctx->out_memcpys); + deferred_memcpy(dst, staging->ptr, copy_size, &subctx->out_memcpys); } -static void ggml_vk_buffer_read_async(vk_context * ctx, vk_buffer* src, size_t offset, void * dst, size_t size, bool sync_staging = false) { - return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst, size, size, size, 1, sync_staging); +static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) { + return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst, size, size, size, 1, sync_staging); } -static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) { +static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl; #endif @@ -1702,61 +1865,88 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_ memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - ggml_vk_buffer_read_async(ctx, src, offset, dst, size, true); - ggml_vk_ctx_end(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true); + ggml_vk_ctx_end(subctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - for (auto& cpy : ctx->out_memcpys) { + for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } } } -static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) { +static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl; #endif + // Make sure both buffers are on same ctx + GGML_ASSERT(src->ctx == dst->ctx); + VkBufferCopy bc{ src_offset, dst_offset, size }; vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc); } -static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) { +static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { + if (src->ctx == dst->ctx) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_copy(" << size << ")" << std::endl; + std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl; #endif - VkBufferCopy bc{ src_offset, dst_offset, size }; + // Copy within the device + ggml_backend_vk_context * ctx = src->ctx; - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc); - ggml_vk_buffer_copy_async(ctx, dst, dst_offset, src, src_offset, size); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); - vk_device.device.resetFences({ vk_fence }); + VkBufferCopy bc{ src_offset, dst_offset, size }; + + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); + } else { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl; +#endif + // Copy device to device + ggml_backend_vk_context * src_ctx = src->ctx; + ggml_backend_vk_context * dst_ctx = dst->ctx; + + ggml_vk_ensure_sync_staging_buffer(src_ctx, size); + ggml_vk_ensure_sync_staging_buffer(dst_ctx, size); + + // Copy to src staging buffer + ggml_vk_buffer_copy(src_ctx->sync_staging, 0, src, src_offset, size); + // memcpy to dst staging buffer + memcpy(dst_ctx->sync_staging->ptr, src_ctx->sync_staging->ptr, size); + // Copy to dst buffer + ggml_vk_buffer_copy(dst, dst_offset, dst_ctx->sync_staging, 0, size); + } } -static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, size_t size) { +static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl; #endif - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); - ggml_vk_ctx_end(ctx); + // Make sure ctx owns the buffer + GGML_ASSERT(dst->ctx == ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_memset waitForFences"); - vk_device.device.resetFences({ vk_fence }); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); + ggml_vk_ctx_end(subctx); + + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); } -static void ggml_vk_h2d_tensor_2d(vk_context * ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { +static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl; #endif @@ -1773,20 +1963,20 @@ static void ggml_vk_h2d_tensor_2d(vk_context * ctx, vk_buffer * dst, size_t offs const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); if (nb0 == ts && nb1 == row_length) { - return ggml_vk_buffer_write_async(ctx, dst, offset, x, i1*nb1); + return ggml_vk_buffer_write_async(ctx, subctx, dst, offset, x, i1*nb1); } if (nb0 == ts && (i1 == ne1 || !ggml_is_permuted(src))) { - return ggml_vk_buffer_write_2d_async(ctx, dst, offset, x, nb1, row_length, i1); + return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, x, nb1, row_length, i1); } GGML_ASSERT(i3 == 0); GGML_ASSERT(i2 == 0); GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src)); - return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src); + return ggml_vk_buffer_write_nc_async(ctx, subctx, dst, offset, src); } -static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offset, const ggml_tensor * dst) { +static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl; #endif @@ -1804,10 +1994,10 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs const size_t row_length = ts*ne0/bs; if (ggml_is_contiguous(dst)) { - return ggml_vk_buffer_read_async(ctx, src, offset, dst->data, ne1*nb1*ne2*ne3); + return ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst->data, ne1*nb1*ne2*ne3); } if (nb0 == ts) { - return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3); + return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3); } GGML_ASSERT(false); } @@ -1829,89 +2019,89 @@ static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { return 1; } -static uint32_t ggml_vk_guess_matmul_pipeline_align(int m, int n) { +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, int m, int n) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl; #endif if (m <= 32 || n <= 32) { - return vk_pipeline_matmul_f32_aligned_s.align; + return ctx->pipeline_matmul_f32_aligned_s.align; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { - return vk_pipeline_matmul_f32_aligned_m.align; + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + return ctx->pipeline_matmul_f32_aligned_m.align; } - return vk_pipeline_matmul_f32_aligned_l.align; + return ctx->pipeline_matmul_f32_aligned_l.align; } -static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, int m, int n, bool aligned) { +static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; #endif if (bit16_x && bit16_y) { - if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_aligned_s : &vk_pipeline_matmul_f16_s; + return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_aligned_m : &vk_pipeline_matmul_f16_m; + return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_aligned_l : &vk_pipeline_matmul_f16_l; + return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l; } if (bit16_x && !bit16_y) { - if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_f32_aligned_s : &vk_pipeline_matmul_f16_f32_s; + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_f32_aligned_m : &vk_pipeline_matmul_f16_f32_m; + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_f32_aligned_l : &vk_pipeline_matmul_f16_f32_l; + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_l : &ctx->pipeline_matmul_f16_f32_l; } if (!bit16_x && bit16_y) { GGML_ASSERT(false); } - if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f32_aligned_s : &vk_pipeline_matmul_f32_s; + return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f32_aligned_m : &vk_pipeline_matmul_f32_m; + return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l; + return aligned ? &ctx->pipeline_matmul_f32_aligned_l : &ctx->pipeline_matmul_f32_l; } -static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { +static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_matmul(a: (" << a.buffer.buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer.buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer.buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer.buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; + std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; #endif - ggml_vk_sync_buffers(ctx); + ggml_vk_sync_buffers(subctx); if (split_k == 1) { const std::array pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; - ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch }); return; } @@ -1919,10 +2109,10 @@ static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer const std::array pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch }); - ggml_vk_sync_buffers(ctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch }); + ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, vk_pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { @@ -1932,32 +2122,32 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_type from, ggml_type to) { +static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) { if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) { - return &vk_pipeline_cpy_f32_f32; + return &ctx->pipeline_cpy_f32_f32; } if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) { - return &vk_pipeline_cpy_f32_f16; + return &ctx->pipeline_cpy_f32_f16; } if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) { - return &vk_pipeline_cpy_f16_f16; + return &ctx->pipeline_cpy_f16_f16; } std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; GGML_ASSERT(false); } -static void ggml_vk_cpy_to_contiguous(vk_context * ctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { +static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; - std::cerr << "buffer in size=" << in.buffer.size << ", buffer out size=" << out.buffer.size << ")" << std::endl; + std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl; #endif const int tensor_type_size = ggml_type_size(tensor->type); const int dst_type_size = ggml_type_size(buffer_type); const uint32_t ne = tensor->ne[0] * tensor->ne[1] * tensor->ne[2]; - const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], vk_device.properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1]; + const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1]; const vk_op_cpy_push_constants pc = { (uint32_t)ne, @@ -1965,11 +2155,11 @@ static void ggml_vk_cpy_to_contiguous(vk_context * ctx, vk_pipeline * pipeline, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], 1 , (uint32_t)tensor->ne[0] , nb2, 0, }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 }); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 }); } -static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -1998,17 +2188,17 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qx = nullptr; + vk_buffer d_Qx; size_t qx_buf_offset = 0; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy; size_t qy_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset); - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; src1_uma = d_Qy != nullptr; } @@ -2031,12 +2221,12 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ne01, ne11)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, ne01, ne11)); const bool aligned = ne10 == kpad; const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); - vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(true, !f16_f32_kernel, ne01, ne11, aligned); + vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); @@ -2044,30 +2234,30 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); - vk_buffer* d_X; + vk_buffer d_X; uint64_t x_buf_offset = 0; - vk_buffer* d_Y; + vk_buffer d_Y; uint64_t y_buf_offset = 0; if (load_x) { - d_Qx = &vk_prealloc_qx; + d_Qx = ctx->prealloc_qx; } else if (!src0_uma) { - d_Qx = &extra_src0->buffer_gpu; + d_Qx = extra_src0->buffer_gpu.lock(); qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else if (!src1_uma) { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { - d_X = &vk_prealloc_x; + d_X = ctx->prealloc_x; GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03); } else { d_X = d_Qx; @@ -2075,7 +2265,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co GGML_ASSERT(qx_sz == x_sz); // NOLINT } if (qy_needs_dequant) { - d_Y = &vk_prealloc_y; + d_Y = ctx->prealloc_y; GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03); } else { d_Y = d_Qy; @@ -2087,49 +2277,49 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co vk_pipeline * to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(src0->type, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); } else { - to_fp16_vk_0 = ggml_vk_get_to_fp16(src0->type); + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(src1->type, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type); + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT // Allocate descriptor sets - ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne12 * ne13); if (qx_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_0, x_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, x_non_contig ? 1 : ne12 * ne13); } if (qy_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); } if (split_k > 1) { - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, ne12 * ne13); } if (x_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_0, src0, { *d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { *d_X, 0, VK_WHOLE_SIZE }, dst->type, false); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type, false); } else if (load_x || qx_needs_dequant) { if (load_x) { // copy data to device - ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); - vk_staging_offset = qx_sz * ne02 * ne03; + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); + ctx->staging_offset = qx_sz * ne02 * ne03; } if (qx_needs_dequant) { const std::vector pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *to_fp16_vk_0, { { *d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { *d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } } if (y_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, dst->type); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, dst->type); } else if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); } uint32_t stride_batch_x = ne00*ne01; @@ -2144,16 +2334,16 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co } // compute - ggml_vk_matmul(ctx, *pipeline, { *d_X, x_buf_offset, x_sz * ne02 * ne03 }, { *d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { *d_D, d_buf_offset, d_sz * ne12 * ne13 }, { vk_prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT + ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data); - ggml_vk_buffer_read_async(ctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); + ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); } } -static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2184,17 +2374,17 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qx = nullptr; + vk_buffer d_Qx; size_t qx_buf_offset = 0; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy; size_t qy_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset); - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; src1_uma = d_Qy != nullptr; } @@ -2214,42 +2404,42 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 const uint64_t y_ne = ne11 * ne10; const uint64_t d_ne = ne11 * ne01; - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_X; + vk_buffer d_X; uint64_t x_buf_offset = 0; - vk_buffer* d_Y; + vk_buffer d_Y; uint64_t y_buf_offset = 0; if (load_x) { - d_Qx = &vk_prealloc_qx; + d_Qx = ctx->prealloc_qx; } else if(!src1_uma) { - d_Qx = &extra_src0->buffer_gpu; + d_Qx = extra_src0->buffer_gpu.lock(); qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else if(!src1_uma) { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { - d_X = &vk_prealloc_x; + d_X = ctx->prealloc_x; } else { d_X = d_Qx; x_buf_offset = qx_buf_offset; GGML_ASSERT(qx_sz == x_sz); } if (qy_needs_dequant) { - d_Y = &vk_prealloc_y; + d_Y = ctx->prealloc_y; } else { d_Y = d_Qy; y_buf_offset = qy_buf_offset; @@ -2259,39 +2449,39 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 vk_pipeline * to_fp16_vk_0 = nullptr; vk_pipeline* to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(src0->type, src0->type); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(src1->type, src1->type); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type); + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } - vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type); + vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type); GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); // Allocate descriptor sets if (qx_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_0, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); } - ggml_vk_pipeline_allocate_descriptor_sets(*dmmv, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *dmmv, ne12 * ne13); if (x_non_contig) { - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment)); - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_0, src0, { *d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { *d_X, 0, VK_WHOLE_SIZE }, src0->type); + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment)); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type); } else if (load_x) { // copy data to device - ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, src1->type); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, src1->type); } else if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); } for (uint64_t i13 = 0; i13 < ne13; i13++) { @@ -2306,34 +2496,34 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 const uint64_t y_offset = y_buf_offset + y_sz * it_idx1; const uint64_t d_offset = d_buf_offset + d_sz * it_idx1; - const uint64_t y_buffer_offset = (y_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t y_buffer_offset = (y_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t y_shader_offset = y_offset - y_buffer_offset; - const uint64_t d_buffer_offset = (d_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_offset - d_buffer_offset; if (!y_non_contig && qy_needs_dequant) { const std::vector pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *to_fp16_vk_1, { { *d_Qy, qy_offset, qy_sz }, { *d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1}); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1}); } // compute const std::array pc = { (int)ne00, (int)(y_shader_offset / ggml_type_size(src1->type)), (int)(d_shader_offset / ggml_type_size(dst->type))}; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *dmmv, { { *d_X, x_offset, x_sz }, { *d_Y, y_buffer_offset, y_sz + y_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, d_offset, d, sizeof(float) * d_ne); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_offset, d, sizeof(float) * d_ne); } } } } -static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2362,13 +2552,13 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy; size_t qy_buf_offset = 0; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src1_uma = d_Qy != nullptr; } @@ -2378,51 +2568,51 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor const uint64_t y_ne = ne10 * ne11 * ne12; const uint64_t d_ne = ne01 * ne11 * ne12; - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx = &extra_src0->buffer_gpu; + vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); const uint64_t qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else if (!src1_uma) { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); } // Allocate descriptor sets - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_p021_f16_f32, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; - const uint64_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); } // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, vk_pipeline_mul_mat_vec_p021_f16_f32, { { *d_Qx, qx_buf_offset, qx_sz }, { *d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) dst->data; - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); } } -static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2454,13 +2644,13 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src1_uma = d_Qy != nullptr; } @@ -2475,43 +2665,43 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * const uint64_t qy_sz = ggml_nbytes(src1); const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx = &extra_src0->buffer_gpu; + vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); const uint64_t qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); } // Allocate descriptor sets - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_nc_f16_f32, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; - const uint64_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); } // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, vk_pipeline_mul_mat_vec_nc_f16_f32, { { *d_Qx, qx_buf_offset, qx_sz }, { *d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) dst->data; - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); } } @@ -2528,22 +2718,22 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); } -static void ggml_vk_mul_mat(vk_context * ctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl; #endif if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - ggml_vk_mul_mat_vec_p021_f16_f32(ctx, src0, src1, dst); + ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { - ggml_vk_mul_mat_vec_nc_f16_f32(ctx, src0, src1, dst); + ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst); } else if (src1->ne[1] == 1 && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_q_f16(ctx, src0, src1, dst); + ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst); } else { - ggml_vk_mul_mat_q_f16(ctx, src0, src1, dst); + ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst); } } -static void ggml_vk_op_repeat(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // guaranteed to be an integer due to the check in ggml_can_repeat const uint64_t ne0 = dst->ne[0]; const uint64_t ne1 = dst->ne[1]; @@ -2579,9 +2769,9 @@ static void ggml_vk_op_repeat(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - const vk_buffer* src_buf = &extra_src0->buffer_gpu; + const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); const uint64_t src_offset = extra_src0->offset; - vk_buffer* dst_buf = &extra->buffer_gpu; + vk_buffer dst_buf = extra->buffer_gpu.lock(); const uint64_t dst_offset = extra->offset; std::vector copies; @@ -2606,78 +2796,79 @@ static void ggml_vk_op_repeat(vk_context * ctx, const ggml_tensor * src0, const } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies); - (void) src1; + GGML_UNUSED(ctx); + GGML_UNUSED(src1); } -static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { +static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_ADD: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_add_f32; + return &ctx->pipeline_add_f32; } return nullptr; case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); if (dst->type == GGML_TYPE_F16) { - return &vk_pipeline_get_rows[src0->type]; + return &ctx->pipeline_get_rows[src0->type]; } if (dst->type == GGML_TYPE_F32) { - return &vk_pipeline_get_rows_f32[src0->type]; + return &ctx->pipeline_get_rows_f32[src0->type]; } return nullptr; case GGML_OP_MUL: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_mul_f32; + return &ctx->pipeline_mul_f32; } return nullptr; case GGML_OP_SCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_scale_f32; + return &ctx->pipeline_scale_f32; } return nullptr; case GGML_OP_SQR: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_sqr_f32; + return &ctx->pipeline_sqr_f32; } return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_clamp_f32; + return &ctx->pipeline_clamp_f32; } return nullptr; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - return ggml_vk_get_cpy_pipeline(src0->type, dst->type); + return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type); case GGML_OP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_norm_f32; + return &ctx->pipeline_norm_f32; } return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_rms_norm_f32; + return &ctx->pipeline_rms_norm_f32; } return nullptr; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_SILU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_silu_f32; + return &ctx->pipeline_silu_f32; } break; case GGML_UNARY_OP_GELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_gelu_f32; + return &ctx->pipeline_gelu_f32; } break; case GGML_UNARY_OP_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_relu_f32; + return &ctx->pipeline_relu_f32; } break; default: @@ -2686,12 +2877,12 @@ static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml return nullptr; case GGML_OP_DIAG_MASK_INF: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_diag_mask_inf_f32; + return &ctx->pipeline_diag_mask_inf_f32; } return nullptr; case GGML_OP_SOFT_MAX: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_soft_max_f32; + return &ctx->pipeline_soft_max_f32; } return nullptr; case GGML_OP_ROPE: @@ -2706,17 +2897,17 @@ static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_rope_neox_f32; + return &ctx->pipeline_rope_neox_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return &vk_pipeline_rope_neox_f16; + return &ctx->pipeline_rope_neox_f16; } } else { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_rope_f32; + return &ctx->pipeline_rope_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return &vk_pipeline_rope_f16; + return &ctx->pipeline_rope_f16; } } return nullptr; @@ -2735,13 +2926,8 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { } } -#ifdef GGML_VULKAN_CHECK_RESULTS -static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor); -#endif - template -static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { @@ -2768,7 +2954,7 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm const uint64_t nb2 = dst->nb[2]; const uint64_t nb3 = dst->nb[3]; - vk_pipeline * pipeline = ggml_vk_op_get_pipeline(src0, src1, dst, op); + vk_pipeline * pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op); ggml_vk_func_t op_func; if (pipeline == nullptr) { @@ -2782,7 +2968,7 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm GGML_ASSERT(false); } - op_func(ctx, src0, src1, dst); + op_func(ctx, subctx, src0, src1, dst); return; } @@ -2790,19 +2976,19 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - vk_buffer * d_X = nullptr; + vk_buffer d_X = nullptr; size_t x_buf_offset = 0; - vk_buffer * d_Y = nullptr; + vk_buffer d_Y = nullptr; size_t y_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src0->data, d_X, x_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset); src0_uma = d_X != nullptr; if (use_src1) { - ggml_vk_host_get(src1->data, d_Y, y_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset); src1_uma = d_Y != nullptr; } } @@ -2810,30 +2996,31 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; - uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, vk_device.properties.limits.minStorageBufferOffsetAlignment); - uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, vk_device.properties.limits.minStorageBufferOffsetAlignment) : 0; + uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); + uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t d_sz = ggml_type_size(dst->type) * ne0; + vk_buffer d_D = extra->buffer_gpu.lock(); + // Workaround for tiny tensor inputs on ROPE - if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > extra_src1->buffer_gpu.size) { + if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } - vk_buffer* d_D = &extra->buffer_gpu; GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + uint64_t d_buf_offset = (extra->offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT if (transfer_src0) { - d_X = &vk_prealloc_qx; + d_X = ctx->prealloc_qx; } else if(!src0_uma) { - d_X = &extra_src0->buffer_gpu; + d_X = extra_src0->buffer_gpu.lock(); x_buf_offset = extra_src0->offset; GGML_ASSERT(d_X != nullptr); } if (transfer_src1) { - d_Y = &vk_prealloc_qy; + d_Y = ctx->prealloc_qy; } else if (use_src1 && !src1_uma) { - d_Y = &extra_src1->buffer_gpu; + d_Y = extra_src1->buffer_gpu.lock(); y_buf_offset = extra_src1->offset; GGML_ASSERT(d_Y != nullptr); } @@ -2856,16 +3043,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm // copy src0 to device if (transfer_src0) { - ggml_vk_h2d_tensor_2d(ctx, d_X, 0, src0, 0, 0, ggml_nrows(src0)); - vk_staging_offset = x_sz * ne02 * ne03; + ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0)); + ctx->staging_offset = x_sz * ne02 * ne03; } if (transfer_src1) { - ggml_vk_h2d_tensor_2d(ctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1)); } // Single call if dimension 2 is contiguous if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { - ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1); switch (dst->op) { case GGML_OP_NORM: @@ -2896,24 +3083,24 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm if (!use_src1 && op == GGML_OP_SOFT_MAX) { // Empty src1 is possible on soft_max, but the shader needs a buffer - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { vk_prealloc_y, 0, vk_prealloc_y.size }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { *d_Y, y_buf_offset, y_sz }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { - ggml_vk_d2h_tensor_2d(ctx, d_D, 0, dst); + ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); } else if(dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) dst->data; - ggml_vk_buffer_read_async(ctx, d_D, 0, d, d_sz); + ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); } } else { - ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne02 * ne03); + ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne02 * ne03); switch (dst->op) { case GGML_OP_NORM: @@ -2940,60 +3127,60 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm if (!use_src1 && op == GGML_OP_SOFT_MAX) { // Empty src1 is possible on soft_max, but the shader needs a buffer - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { vk_prealloc_y, 0, vk_prealloc_y.size }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset + x_offset, x_sz }, { *d_Y, y_buf_offset + y_offset, y_sz }, { *d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } else { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset + x_offset, x_sz }, { *d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host - ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); } } } } } -static void ggml_vk_repeat(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_get_rows(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_add(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_mul(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_scale(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); } -static void ggml_vk_sqr(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_SQR, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); +static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_clamp(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, op_params[0], op_params[1] }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, op_params[0], op_params[1] }); } -static void ggml_vk_cpy(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const int src0_type_size = ggml_type_size(src0->type); const int dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = (extra->offset % vk_device.properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_CPY, { + const uint32_t d_offset = (extra->offset % ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, @@ -3001,30 +3188,30 @@ static void ggml_vk_cpy(vk_context * ctx, const ggml_tensor * src0, ggml_tensor }); } -static void ggml_vk_norm(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }); +static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }); } -static void ggml_vk_rms_norm(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_unary(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); +static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_diag_mask_inf(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } -static void ggml_vk_soft_max(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? ggml_nrows(src1) : 0), op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? ggml_nrows(src1) : 0), op_params[0], 0.0f }); } -static void ggml_vk_rope(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -3047,19 +3234,19 @@ static void ggml_vk_rope(vk_context * ctx, const ggml_tensor * src0, const ggml_ if (is_neox) { const float theta_scale = powf(freq_base, -2.0f/n_dims); const float inv_ndims = -1.0f / n_dims; - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims }); + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims }); } else { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f }); } } -static void ggml_vk_nop(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { // If backend is CPU, data from src0 has to be copied off the device if (dst->backend == GGML_BACKEND_CPU) { ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - vk_buffer * d_D = &extra_src0->buffer_gpu; - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, 0, dst->data, d_D->size); + vk_buffer d_D = extra_src0->buffer_gpu.lock(); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size); } } @@ -3096,7 +3283,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 } template -static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { +static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl; #endif @@ -3108,39 +3295,39 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size std::string shname; if (shader_size == 0) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_aligned_s; + p = &ctx->pipeline_matmul_f32_aligned_s; shname = "F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_aligned_s; + p = &ctx->pipeline_matmul_f16_f32_aligned_s; shname = "F16_F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_aligned_s; + p = &ctx->pipeline_matmul_f16_aligned_s; shname = "F16_ALIGNED_S"; } else { GGML_ASSERT(false); } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_aligned_m; + p = &ctx->pipeline_matmul_f32_aligned_m; shname = "F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_aligned_m; + p = &ctx->pipeline_matmul_f16_f32_aligned_m; shname = "F16_F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_aligned_m; + p = &ctx->pipeline_matmul_f16_aligned_m; shname = "F16_ALIGNED_M"; } else { GGML_ASSERT(false); } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_aligned_l; + p = &ctx->pipeline_matmul_f32_aligned_l; shname = "F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_aligned_l; + p = &ctx->pipeline_matmul_f16_f32_aligned_l; shname = "F16_F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_aligned_l; + p = &ctx->pipeline_matmul_f16_aligned_l; shname = "F16_ALIGNED_L"; } else { GGML_ASSERT(false); @@ -3154,56 +3341,56 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size if (k != kpad) { if (shader_size == 0) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_s; + p = &ctx->pipeline_matmul_f32_s; shname = "F32_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_s; + p = &ctx->pipeline_matmul_f16_f32_s; shname = "F16_F32_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_s; + p = &ctx->pipeline_matmul_f16_s; shname = "F16_S"; } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_m; + p = &ctx->pipeline_matmul_f32_m; shname = "F32_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_m; + p = &ctx->pipeline_matmul_f16_f32_m; shname = "F16_F32_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_m; + p = &ctx->pipeline_matmul_f16_m; shname = "F16_M"; } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_l; + p = &ctx->pipeline_matmul_f32_l; shname = "F32_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_l; + p = &ctx->pipeline_matmul_f16_f32_l; shname = "F16_F32_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_l; + p = &ctx->pipeline_matmul_f16_l; shname = "F16_L"; } } } - ggml_vk_pipeline_allocate_descriptor_sets(*p, num_it); + ggml_pipeline_allocate_descriptor_sets(ctx, *p, num_it); if (split_k > 1) { - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, num_it); - if (vk_prealloc_split_k.size < sizeof(float) * d_ne * split_k) { + if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer - if (vk_prealloc_split_k.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_split_k); + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); } - vk_prealloc_split_k = ggml_vk_create_buffer_check(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); + ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); } } - vk_buffer d_X = ggml_vk_create_buffer_check(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer d_Y = ggml_vk_create_buffer_check(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer d_D = ggml_vk_create_buffer_check(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); @@ -3228,26 +3415,26 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size } } - ggml_vk_buffer_write(&d_X, 0, x, sizeof(X_TYPE) * k * m * batch); - ggml_vk_buffer_write(&d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); + ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch); + ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx); - ggml_vk_matmul(ctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(vk_prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); - ggml_vk_ctx_end(ctx); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_matmul(ctx, subctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); + ggml_vk_ctx_end(subctx); } auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast(end-begin).count() / 1000.0; // copy dst to host - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne); float * d_chk = (float *) malloc(sizeof(float) * d_ne); @@ -3285,14 +3472,14 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size src1_ggml->data = y; tensor_ggml->data = d_chk; - vk_disable = true; + ctx->disable = true; ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_build_forward_expand(cgraph, tensor_ggml); ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); - vk_disable = false; + ctx->disable = false; ggml_free(ggml_ctx); @@ -3325,7 +3512,7 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size if (split_k > 1) { float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); - ggml_vk_buffer_read(&vk_prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); + ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); std::cerr << "d_buf0: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); @@ -3345,15 +3532,15 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size free(d_chk); - ggml_vk_queue_cleanup(vk_device.transfer_queue); - ggml_vk_queue_cleanup(vk_device.compute_queue); + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue); ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_D); - ggml_vk_pipeline_cleanup(*p); - ggml_vk_pipeline_cleanup(vk_pipeline_matmul_split_k_reduce); + ggml_pipeline_cleanup(*p); + ggml_pipeline_cleanup(ctx->pipeline_matmul_split_k_reduce); free(x); free(y); @@ -3392,7 +3579,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 } } -static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) { +static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_t ne1, size_t ne2, size_t ne3) { const size_t ne = ne0 * ne1 * ne2 * ne3; ggml_init_params iparams = { @@ -3406,7 +3593,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) ggml_tensor * tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne2, ne1, ne3); // NOLINT ggml_tensor * result_tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne1, ne2, ne3); - float * data = (float *) ggml_vk_host_malloc(ggml_nbytes(tensor)); + float * data = (float *) ggml_vk_host_malloc(ctx, ggml_nbytes(tensor)); tensor->data = data; float * result_data = (float *) malloc(ggml_nbytes(tensor)); @@ -3426,19 +3613,19 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; } - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, subctx); - vk_buffer buffer = ggml_vk_create_buffer_check(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); - ggml_vk_h2d_tensor_2d(ctx, &buffer, 0, tensor, 0, 0, ggml_nrows(tensor)); + ggml_vk_h2d_tensor_2d(ctx, subctx, buffer, 0, tensor, 0, 0, ggml_nrows(tensor)); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - ggml_vk_buffer_read(&buffer, 0, result_data, ggml_nbytes(tensor)); + ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor)); double avg_err = 0.0; int first_err_i0 = -1; @@ -3483,22 +3670,22 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) ggml_vk_destroy_buffer(buffer); - ggml_vk_host_free(data); + ggml_vk_host_free(ctx, data); free(result_data); } -static void ggml_vk_test_transfer(size_t ne, bool pinned) { +static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl; #endif // Check transfers are correct - vk_buffer buffer = ggml_vk_create_buffer_check(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); float * x; float * y; if (pinned) { - x = (float *) ggml_vk_host_malloc(sizeof(float) * ne); - y = (float *) ggml_vk_host_malloc(sizeof(float) * ne); + x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne); + y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne); } else { x = (float *) malloc(sizeof(float) * ne); y = (float *) malloc(sizeof(float) * ne); @@ -3508,42 +3695,42 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) { x[i] = rand() / (float)RAND_MAX; } - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, subctx); auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_buffer_write_async(ctx, &buffer, 0, x, sizeof(float) * ne); + ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne); - for (auto& cpy : ctx->in_memcpys) { + for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx->in_memcpys.clear(); + subctx->in_memcpys.clear(); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double ms_to_gpu = std::chrono::duration_cast(end-begin).count() / 1000.0; - ggml_vk_ctx_begin(ctx); + ggml_vk_ctx_begin(ctx, subctx); begin = std::chrono::high_resolution_clock::now(); - ggml_vk_buffer_read_async(ctx, &buffer, 0, y, sizeof(float) * ne); + ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - for (auto& cpy : ctx->out_memcpys) { + for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx->out_memcpys.clear(); + subctx->out_memcpys.clear(); end = std::chrono::high_resolution_clock::now(); @@ -3561,15 +3748,15 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) { ggml_vk_destroy_buffer(buffer); if (pinned) { - ggml_vk_host_free(x); - ggml_vk_host_free(y); + ggml_vk_host_free(ctx, x); + ggml_vk_host_free(ctx, y); } else { free(x); free(y); } } -static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { +static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl; #endif @@ -3578,8 +3765,8 @@ static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); float * x = (float *) malloc(x_sz); void * qx = malloc(qx_sz); - vk_buffer qx_buf = ggml_vk_create_buffer_check(qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer x_buf = ggml_vk_create_buffer_check(x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); for (size_t i = 0; i < ne; i++) { @@ -3588,7 +3775,7 @@ static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { std::vector hist_cur(1 << 4, 0); - vk_pipeline& p = vk_pipeline_dequant[quant]; + vk_pipeline& p = ctx->pipeline_dequant[quant]; switch(quant) { case GGML_TYPE_Q4_0: @@ -3625,27 +3812,26 @@ static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { GGML_ASSERT(false); } - ggml_vk_pipeline_allocate_descriptor_sets(p, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, p, 1); - ggml_vk_buffer_write(&qx_buf, 0, qx, qx_sz); + ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, subctx); const std::vector pc = { 1, (int)ne, (int)ne, (int)ne }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); - ggml_vk_ctx_end(ctx); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double ms_dequant = std::chrono::duration_cast(end-begin).count() / 1000.0; - ggml_vk_buffer_read(&x_buf, 0, x_chk, x_sz_f16); + ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16); double avg_err = 0.0; for (size_t i = 0; i < ne; i++) { @@ -3687,15 +3873,15 @@ static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph return nullptr; } -void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ +static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; + std::cerr << "ggml_ctx->preallocate_buffers_graph(" << node << ")" << std::endl; #endif const bool any_on_device = node->backend == GGML_BACKEND_GPU || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU)); - if (vk_disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { + if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { return; } @@ -3735,16 +3921,16 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ const uint32_t y_ne = ne10 * ne11; const uint32_t d_ne = ne20 * ne21; - const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; + const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0; - if (extra->buffer_gpu.size == 0) { + if (extra->buffer_gpu.expired()) { // Workaround for CPU backend BLAS matmul calls - extra->buffer_gpu = ggml_vk_create_buffer_temp(d_sz); + extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz); } switch (node->op) { @@ -3779,23 +3965,23 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ } break; case GGML_OP_MUL_MAT: - if (vk_prealloc_size_qx < qx_sz) { - vk_prealloc_size_qx = qx_sz; + if (ctx->prealloc_size_qx < qx_sz) { + ctx->prealloc_size_qx = qx_sz; } - if (vk_prealloc_size_qy < qy_sz) { - vk_prealloc_size_qy = qy_sz; + if (ctx->prealloc_size_qy < qy_sz) { + ctx->prealloc_size_qy = qy_sz; } - if (vk_prealloc_size_x < x_sz) { - vk_prealloc_size_x = x_sz; + if (ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; } - if (vk_prealloc_size_y < y_sz) { - vk_prealloc_size_y = y_sz; + if (ctx->prealloc_size_y < y_sz) { + ctx->prealloc_size_y = y_sz; } - if (vk_prealloc_size_split_k < split_k_size) { - vk_prealloc_size_split_k = split_k_size; + if (ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; } - if (vk_staging_size < x_sz + y_sz) { - vk_staging_size = x_sz + y_sz; + if (ctx->staging_size < x_sz + y_sz) { + ctx->staging_size = x_sz + y_sz; } break; default: @@ -3803,29 +3989,29 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ } } -void ggml_vk_preallocate_buffers() { - if (vk_disable) { +static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { + if (ctx->disable) { return; } #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers()" << std::endl; - std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl; + std::cerr << "ggml_ctx->preallocate_buffers()" << std::endl; + std::cerr << "qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) - vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - ggml_vk_test_transfer(8192 * 1000, false); - ggml_vk_test_transfer(8192 * 1000, true); + ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ggml_vk_test_transfer(ctx, 8192 * 1000, false); + ggml_vk_test_transfer(ctx, 8192 * 1000, true); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_0); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_1); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_0); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_1); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q8_0); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q2_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q3_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q6_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_0); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_1); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_0); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_1); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q8_0); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q2_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q3_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q6_K); const std::vector vals { 8, 8, 8, @@ -3852,76 +4038,76 @@ void ggml_vk_preallocate_buffers() { }; const size_t num_it = 1; for (size_t i = 0; i < vals.size(); i += 3) { - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); std::cerr << std::endl; } GGML_ASSERT(false); #endif - if (vk_prealloc_size_qx > 0 && vk_prealloc_qx.size < vk_prealloc_size_qx) { + if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) { // Resize buffer - if (vk_prealloc_qx.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_qx); + if (ctx->prealloc_qx != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_qx); } - vk_prealloc_qx = ggml_vk_create_buffer_device(vk_prealloc_size_qx); + ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx); } - if (vk_prealloc_size_qy > 0 && vk_prealloc_qy.size < vk_prealloc_size_qy) { + if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) { // Resize buffer - if (vk_prealloc_qy.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_qy); + if (ctx->prealloc_qy != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_qy); } - vk_prealloc_qy = ggml_vk_create_buffer_device(vk_prealloc_size_qy); + ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy); } - if (vk_prealloc_size_x > 0 && vk_prealloc_x.size < vk_prealloc_size_x) { + if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { // Resize buffer - if (vk_prealloc_x.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_x); + if (ctx->prealloc_x != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_x); } - vk_prealloc_x = ggml_vk_create_buffer_device(vk_prealloc_size_x); + ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x); } - if (vk_prealloc_size_y > 0 && vk_prealloc_y.size < vk_prealloc_size_y) { + if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { // Resize buffer - if (vk_prealloc_y.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_y); + if (ctx->prealloc_y != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_y); } - vk_prealloc_y = ggml_vk_create_buffer_device(vk_prealloc_size_y); + ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y); } - if (vk_prealloc_size_split_k > 0 && vk_prealloc_split_k.size < vk_prealloc_size_split_k) { + if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { // Resize buffer - if (vk_prealloc_split_k.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_split_k); + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); } - vk_prealloc_split_k = ggml_vk_create_buffer_device(vk_prealloc_size_split_k); + ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k); } - if (vk_staging_size > 0 && vk_staging.size < vk_staging_size) { + if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) { // Resize buffer - if (vk_staging.size > 0) { - ggml_vk_destroy_buffer(vk_staging); + if (ctx->staging != nullptr) { + ggml_vk_destroy_buffer(ctx->staging); } - vk_staging = ggml_vk_create_buffer_check(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); } } -void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ +static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ const bool any_on_device = node->backend == GGML_BACKEND_GPU || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU); - if (vk_disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { + if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { return; } #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl; #endif - vk_semaphore_idx = 0; - vk_staging_offset = 0; + ctx->semaphore_idx = 0; + ctx->staging_offset = 0; const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; @@ -3969,44 +4155,44 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ return; } - if (vk_ctx == nullptr) { - vk_ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(vk_ctx); + if (ctx->compute_ctx == nullptr) { + ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, ctx->compute_ctx); } switch (node->op) { case GGML_OP_REPEAT: - ggml_vk_repeat(vk_ctx, src0, src1, node); + ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_GET_ROWS: - ggml_vk_get_rows(vk_ctx, src0, src1, node); + ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_ADD: - ggml_vk_add(vk_ctx, src0, src1, node); + ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_MUL: - ggml_vk_mul(vk_ctx, src0, src1, node); + ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_SCALE: - ggml_vk_scale(vk_ctx, src0, node); + ggml_vk_scale(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_SQR: - ggml_vk_sqr(vk_ctx, src0, node); + ggml_vk_sqr(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_CLAMP: - ggml_vk_clamp(vk_ctx, src0, node); + ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - ggml_vk_cpy(vk_ctx, src0, node); + ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_RESHAPE: @@ -4014,15 +4200,15 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NONE: - ggml_vk_nop(vk_ctx, src0, node); + ggml_vk_nop(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_NORM: - ggml_vk_norm(vk_ctx, src0, node); + ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(vk_ctx, src0, node); + ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_UNARY: @@ -4030,26 +4216,26 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_RELU: - ggml_vk_unary(vk_ctx, src0, node); + ggml_vk_unary(ctx, ctx->compute_ctx, src0, node); break; default: return; } break; case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(vk_ctx, src0, node); + ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_SOFT_MAX: - ggml_vk_soft_max(vk_ctx, src0, src1, node); + ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_ROPE: - ggml_vk_rope(vk_ctx, src0, src1, node); + ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(vk_ctx, src0, src1, node); + ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node); break; default: @@ -4057,7 +4243,7 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ } extra->ready = true; - extra->ctx_idx = vk_ctx->idx; + extra->ctx_idx = ctx->compute_ctx->idx; #ifdef GGML_VULKAN_CHECK_RESULTS // Force context reset on each node so that each tensor ends up in its own context @@ -4066,18 +4252,18 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ #endif if (node->backend == GGML_BACKEND_CPU || last_node) { - ggml_vk_ctx_end(vk_ctx); - vk_ctx->exit_tensor = node; - vk_ctx = nullptr; + ggml_vk_ctx_end(ctx->compute_ctx); + ctx->compute_ctx->exit_tensor = node; + ctx->compute_ctx = nullptr; } } -bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); - if (vk_disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { + if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { return false; } @@ -4145,33 +4331,33 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor) #endif #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(params, tensor); + ggml_vk_check_results_0(ctx, params, tensor); #endif GGML_ASSERT(extra->ready); - vk_context& ctx = vk_gc.contexts[extra->ctx_idx]; + vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; // Only run if ctx hasn't been submitted yet - if (!ctx.seqs.empty()) { + if (!subctx.seqs.empty()) { // Do staging buffer copies - for (auto& cpy : ctx.in_memcpys) { + for (auto& cpy : subctx.in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(&ctx, vk_fence); + ggml_vk_submit(&subctx, ctx->fence); } - if (tensor == ctx.exit_tensor) { - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + if (tensor == subctx.exit_tensor) { + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); // Do staging buffer copies - for (auto& cpy : ctx.out_memcpys) { + for (auto& cpy : subctx.out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx.in_memcpys.clear(); - ctx.out_memcpys.clear(); + subctx.in_memcpys.clear(); + subctx.out_memcpys.clear(); } extra->ready = false; @@ -4179,90 +4365,204 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor) return true; } -void ggml_vk_graph_cleanup() { - if (vk_disable) { +// Clean up after graph processing is done +static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { + if (ctx->disable) { return; } #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_graph_cleanup()" << std::endl; #endif - for (auto& buffer : vk_gc.temp_buffers) { - ggml_vk_pool_free(buffer); + for (auto& buffer : ctx->gc.temp_buffers) { + ggml_vk_pool_free(ctx, buffer); } - vk_gc.temp_buffers.clear(); + ctx->gc.temp_buffers.clear(); - for (auto * pipeline : vk_gc.pipelines) { - ggml_vk_pipeline_cleanup(*pipeline); - } - vk_gc.pipelines.clear(); - - ggml_vk_queue_cleanup(vk_device.compute_queue); - ggml_vk_queue_cleanup(vk_device.transfer_queue); - - for (size_t i = 0; i < vk_gc.semaphores.size(); i++) { - vk_device.device.destroySemaphore({ vk_gc.semaphores[i].s }); - } - vk_gc.semaphores.clear(); - - for (size_t i = 0; i < vk_gc.tl_semaphores.size(); i++) { - vk_device.device.destroySemaphore({ vk_gc.tl_semaphores[i].s }); - } - vk_gc.tl_semaphores.clear(); - - vk_event_idx = 0; - - for (auto& event : vk_gc.events) { - vk_device.device.resetEvent(event); + for (auto * pipeline : ctx->gc.pipelines) { + ggml_pipeline_cleanup(*pipeline); } - vk_staging_offset = 0; + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue); + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue); - vk_ctx = nullptr; - vk_gc.contexts.clear(); + for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { + ctx->device.lock()->device.destroySemaphore({ ctx->gc.semaphores[i].s }); + } + ctx->gc.semaphores.clear(); + + for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { + ctx->device.lock()->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); + } + ctx->gc.tl_semaphores.clear(); + ctx->semaphore_idx = 0; + + ctx->event_idx = 0; + + for (auto& event : ctx->gc.events) { + ctx->device.lock()->device.resetEvent(event); + } + + ctx->staging_offset = 0; + + ctx->compute_ctx = nullptr; + ctx->transfer_ctx = nullptr; + ctx->gc.contexts.clear(); } -static void ggml_vk_cleanup() { +// Clean up on backend free +static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_cleanup()" << std::endl; + std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl; #endif - ggml_vk_destroy_buffer(vk_prealloc_x); - ggml_vk_destroy_buffer(vk_prealloc_y); - ggml_vk_destroy_buffer(vk_prealloc_split_k); - ggml_vk_destroy_buffer(vk_staging); - ggml_vk_destroy_buffer(vk_sync_staging); + ggml_vk_graph_cleanup(ctx); - vk_prealloc_size_x = 0; - vk_prealloc_size_y = 0; - vk_prealloc_size_split_k = 0; - vk_staging_size = 0; + ggml_vk_destroy_buffer(ctx->prealloc_qx); + ggml_vk_destroy_buffer(ctx->prealloc_qy); + ggml_vk_destroy_buffer(ctx->prealloc_x); + ggml_vk_destroy_buffer(ctx->prealloc_y); + ggml_vk_destroy_buffer(ctx->prealloc_split_k); + ggml_vk_destroy_buffer(ctx->staging); + ggml_vk_destroy_buffer(ctx->sync_staging); - for (auto& event : vk_gc.events) { - vk_device.device.destroyEvent(event); + for (auto& buffer : ctx->buffer_pool) { + ggml_vk_destroy_buffer(buffer); } - vk_gc.events.clear(); + + ctx->prealloc_size_qx = 0; + ctx->prealloc_size_qy = 0; + ctx->prealloc_size_x = 0; + ctx->prealloc_size_y = 0; + ctx->prealloc_size_split_k = 0; + ctx->staging_size = 0; + + for (auto& event : ctx->gc.events) { + ctx->device.lock()->device.destroyEvent(event); + } + ctx->gc.events.clear(); + + for (auto* pipeline : ctx->gc.pipelines) { + ggml_vk_destroy_pipeline(ctx, pipeline); + } + ctx->gc.pipelines.clear(); + + ctx->device.lock()->device.destroyFence(ctx->fence); + + ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->compute_queue.pool); + if (!ctx->device.lock()->single_queue) { + ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->transfer_queue.pool); + } +} + +GGML_CALL int ggml_vk_get_device_count() { + ggml_vk_instance_init(); + + return vk_instance.device_indices.size(); +} + +GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) { + ggml_vk_instance_init(); + + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + vk::PhysicalDeviceProperties props; + devices[device].getProperties(&props); + + snprintf(description, description_size, "%s", props.deviceName.data()); +} + +// CPU assist interface + +void ggml_vk_init_cpu_assist() { + ggml_vk_instance_init(); + + std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl; + + for (size_t i = 0; i < ggml_vk_get_device_count(); i++) { + ggml_vk_print_gpu_info(i); + } + // Initialize the first backend to make sure CPU matrix multiplications can be offloaded. + ggml_backend_vk_init(0); +} + +void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_preallocate_buffers_graph(ctx, node); +} + +void ggml_vk_preallocate_buffers_cpu_assist() { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_preallocate_buffers(ctx); +} + +void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_build_graph(ctx, node, last_node); +} + +bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){ + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return false; + } + + return ggml_vk_compute_forward(ctx, params, tensor); +} + +void ggml_vk_graph_cleanup_cpu_assist() { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_graph_cleanup(ctx); +} + +void ggml_vk_free_cpu_assist() { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized || vk_instance.backends[0] == nullptr) { + return; + } + + ggml_backend_vk_free(vk_instance.backends[0]); } // backend interface #define UNUSED GGML_UNUSED -struct ggml_backend_vk_context { - std::string name; -}; - // device backend static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT struct ggml_backend_vk_buffer_context { + ggml_backend_vk_context * ctx; vk_buffer dev_buffer; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; std::string name; - ggml_backend_vk_buffer_context(vk_buffer dev_buffer) : + ggml_backend_vk_buffer_context(ggml_backend_vk_context * ctx, vk_buffer&& dev_buffer, std::string& name) : + ctx(ctx), dev_buffer(dev_buffer), - name(GGML_VK_NAME) { + name(name) { } ~ggml_backend_vk_buffer_context() { @@ -4294,6 +4594,9 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { } GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl; +#endif ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -4313,6 +4616,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra; extra->buffer_gpu = extra_view->buffer_gpu; extra->offset = extra_view->offset + tensor->view_offs; @@ -4331,11 +4635,13 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu #endif GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_write(&extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(buffer); + ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size); } GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -4344,31 +4650,35 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu #endif GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(buffer); + ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size); } GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_vk_buffer_copy(&src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src)); + vk_buffer src_buf = src_extra->buffer_gpu.lock(); + vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + + ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src)); return true; } return false; - - UNUSED(buffer); } GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - ggml_vk_buffer_memset(&ctx->dev_buffer, 0, value, buffer->size); + ggml_vk_buffer_memset(ctx->ctx, ctx->dev_buffer, 0, value, buffer->size); } static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { @@ -4386,6 +4696,7 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { // vk buffer type struct ggml_backend_vk_buffer_type_context { std::string name; + ggml_backend_vk_context * ctx; }; GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { @@ -4398,25 +4709,22 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer( #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl; #endif - vk_buffer dev_buffer = ggml_vk_create_buffer_device(size); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size); - ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context(dev_buffer); + ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name); - return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, ctx, size); - - UNUSED(buft); + return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return vk_device.properties.limits.minStorageBufferOffsetAlignment; - - UNUSED(buft); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + return ctx->ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - return vk_device.max_memory_allocation_size; - - UNUSED(buft); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + return ctx->ctx->device.lock()->max_memory_allocation_size; } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { @@ -4426,9 +4734,14 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_ } GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_vk(backend); + if (!ggml_backend_is_vk(backend)) { + return false; + } - UNUSED(buft); + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return buft_ctx->ctx->idx == ctx->idx; } static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { @@ -4441,20 +4754,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type() { - static ggml_backend_buffer_type ggml_backend_vk_buffer_type; +GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl; +#endif - static bool ggml_backend_vk_buffer_type_initialized = false; + GGML_ASSERT(idx < vk_instance.device_indices.size()); - if (!ggml_backend_vk_buffer_type_initialized) { - ggml_backend_vk_buffer_type = { - /* .iface = */ ggml_backend_vk_buffer_type_interface, - /* .context = */ new ggml_backend_vk_buffer_type_context{GGML_VK_NAME}, - }; - ggml_backend_vk_buffer_type_initialized = true; - } + ggml_backend_vk_init(idx); - return &ggml_backend_vk_buffer_type; + return &vk_instance.buffer_types[idx]; } // host buffer type @@ -4472,13 +4781,19 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff } GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_vk_host_free(buffer->context); +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl; +#endif + ggml_vk_host_free(&vk_instance.contexts[0], buffer->context); } GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl; +#endif void * ptr = nullptr; try { - ptr = ggml_vk_host_malloc(size); + ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size); } catch (vk::SystemError& e) { std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -4495,7 +4810,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu } GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return vk_device.properties.limits.minMemoryMapAlignment; + return vk_instance.contexts[0].device.lock()->properties.limits.minMemoryMapAlignment; UNUSED(buft); } @@ -4514,127 +4829,150 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .context = */ nullptr, }; + if (!vk_instance.contexts[0].initialized) { + // Fall back to CPU + return ggml_backend_cpu_buffer_type(); + } + return &ggml_backend_vk_buffer_type_host; } // backend GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) { - ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - return vk_ctx->name.c_str(); + return ctx->name.c_str(); } GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { - ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl; +#endif - delete vk_ctx; + size_t idx = ctx->idx; + + ggml_vk_cleanup(ctx); + + // Release device + vk_instance.devices[ctx->idx].reset(); + ctx->initialized = false; + + vk_instance.initialized[idx] = false; + vk_instance.backends[idx] = nullptr; + memset(&vk_instance.buffer_types[idx], 0, sizeof(ggml_backend_buffer_type)); delete backend; } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_vk_buffer_type(); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - UNUSED(backend); + GGML_ASSERT(ctx->initialized); + + return ggml_backend_vk_buffer_type(ctx->idx); } GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl; #endif - GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (vk_transfer_ctx == nullptr) { + if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_transfer_ctx); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } - ggml_vk_buffer_write_async(vk_transfer_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(backend); + ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl; #endif - GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (vk_transfer_ctx == nullptr) { + if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_transfer_ctx); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } - ggml_vk_buffer_read_async(vk_transfer_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(backend); + ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl; #endif - if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - if (vk_transfer_ctx == nullptr) { + if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_transfer_ctx); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } - ggml_vk_buffer_copy_async(vk_transfer_ctx, &src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src)); + vk_buffer src_buf = src_extra->buffer_gpu.lock(); + vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + + ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src)); return true; } return false; - - UNUSED(backend); } GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_synchronize()" << std::endl; #endif - if(vk_transfer_ctx == nullptr) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if(ctx->transfer_ctx == nullptr) { return; } - ggml_vk_ctx_end(vk_transfer_ctx); + ggml_vk_ctx_end(ctx->transfer_ctx); - for (auto& cpy : vk_transfer_ctx->in_memcpys) { + for (auto& cpy : ctx->transfer_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(vk_transfer_ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(ctx->transfer_ctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - for (auto& cpy : vk_transfer_ctx->out_memcpys) { + for (auto& cpy : ctx->transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - vk_transfer_ctx = nullptr; - - UNUSED(backend); + ctx->transfer_ctx = nullptr; } GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - // ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]); + ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]); } - ggml_vk_preallocate_buffers(); + ggml_vk_preallocate_buffers(ctx); int last_node = cgraph->n_nodes - 1; @@ -4644,7 +4982,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml } for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(cgraph->nodes[i], i == last_node); + ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node); } ggml_compute_params params = {}; @@ -4657,19 +4995,19 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml continue; } - bool ok = ggml_vk_compute_forward(¶ms, node); + bool ok = ggml_vk_compute_forward(ctx, ¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } #ifdef GGML_VULKAN_CHECK_RESULTS else { - ggml_vk_check_results_1(¶ms, node); + ggml_vk_check_results_1(ctx, ¶ms, node); } #endif GGML_ASSERT(ok); } - ggml_vk_graph_cleanup(); + ggml_vk_graph_cleanup(ctx); return true; @@ -4734,7 +5072,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } return false; } break; - // case GGML_OP_DUP: + case GGML_OP_DUP: // case GGML_OP_REPEAT: // { // ggml_type src0_type = op->src[0]->type; @@ -4786,18 +5124,30 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .supports_op = */ ggml_backend_vk_supports_op, }; -GGML_CALL ggml_backend_t ggml_backend_vk_init() { - ggml_vk_init(); // TODO: remove from ggml.c +GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { + if (vk_instance.initialized[idx]) { + return vk_instance.backends[idx]; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl; +#endif - ggml_backend_vk_context * ctx = new ggml_backend_vk_context { - /* .name = */ GGML_VK_NAME, + ggml_backend_vk_context * ctx = &vk_instance.contexts[idx]; + ggml_vk_init(ctx, idx); + ctx->name = GGML_VK_NAME + std::to_string(idx); + vk_instance.buffer_types[idx] = { + /* .iface = */ ggml_backend_vk_buffer_type_interface, + /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx }, }; + vk_instance.initialized[idx] = true; ggml_backend_t vk_backend = new ggml_backend { /* .interface = */ ggml_backend_vk_interface, - /* .context = */ ctx + /* .context = */ &vk_instance.contexts[ctx->idx], }; + vk_instance.backends[idx] = vk_backend; + return vk_backend; } @@ -4805,20 +5155,47 @@ GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_vk_name; } +GGML_CALL int ggml_backend_vk_get_device_count() { + return ggml_vk_get_device_count(); +} + +GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { + ggml_vk_get_device_description(device, description, description_size); +} + +GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { + GGML_ASSERT(device < vk_instance.device_indices.size()); + + vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; + + vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); + + for (const vk::MemoryHeap& heap : memprops.memoryHeaps) { + if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + *total = heap.size; + *free = heap.size; + break; + } + } +} + // backend registry GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) { - ggml_backend_t vk_backend = ggml_backend_vk_init(); + ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data); return vk_backend; UNUSED(params); - UNUSED(user_data); } extern "C" GGML_CALL int ggml_backend_vk_reg_devices(); GGML_CALL int ggml_backend_vk_reg_devices() { - ggml_backend_register(GGML_VK_NAME, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(), nullptr); - return 1; + for (auto idx : vk_instance.device_indices) { + char name[128]; + snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx); + ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx); + } + return vk_instance.device_indices.size(); } // checks @@ -4874,7 +5251,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } } -static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { +static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; if (tensor->backend == GGML_BACKEND_GPU) { @@ -4883,7 +5260,7 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -4944,7 +5321,7 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) { +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) { if (params->ith != 0) { return; } @@ -4966,7 +5343,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * /*.no_alloc =*/ false, }; - struct ggml_context * ctx = ggml_init(iparams); + struct ggml_context * ggml_ctx = ggml_init(iparams); struct ggml_tensor * src0_clone = nullptr; struct ggml_tensor * src1_clone = nullptr; @@ -4979,7 +5356,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * void * src1_buffer; if (src0 != nullptr) { - src0_clone = ggml_dup_tensor(ctx, src0); + src0_clone = ggml_dup_tensor(ggml_ctx, src0); src0_size = ggml_nbytes(src0); @@ -4995,7 +5372,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < src0->ne[2]; i2++) { const int idx = i3*src0->ne[2] + i2; - ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); } } @@ -5005,10 +5382,10 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1]; } } else { - if (offset + src0_size >= extra->buffer_gpu.size) { - src0_size = extra->buffer_gpu.size - offset; + if (offset + src0_size >= extra->buffer_gpu->size) { + src0_size = extra->buffer_gpu->size - offset; } - ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src0_clone->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -5016,13 +5393,13 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(src0, "src0"); + ggml_vk_print_tensor(ctx, src0, "src0"); } ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone); } if (src1 != nullptr) { - src1_clone = ggml_dup_tensor(ctx, src1); + src1_clone = ggml_dup_tensor(ggml_ctx, src1); src1_size = ggml_nbytes(src1); @@ -5038,7 +5415,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { const int idx = i3*src1->ne[2] + i2; - ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); } } @@ -5048,10 +5425,10 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1]; } } else { - if (offset + src1_size >= extra->buffer_gpu.size) { - src1_size = extra->buffer_gpu.size - offset; + if (offset + src1_size >= extra->buffer_gpu->size) { + src1_size = extra->buffer_gpu->size - offset; } - ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src1_clone->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -5059,7 +5436,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(src1, "src1"); + ggml_vk_print_tensor(ctx, src1, "src1"); std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl; std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl; if (src1->src[0] != nullptr) { @@ -5082,51 +5459,51 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } if (tensor->op == GGML_OP_MUL_MAT) { - tensor_clone = ggml_mul_mat(ctx, src0_clone, src1_clone); + tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_MUL) { - tensor_clone = ggml_mul(ctx, src0_clone, src1_clone); + tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_SCALE) { - tensor_clone = ggml_scale(ctx, src0_clone, ((float *)tensor->op_params)[0]); + tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]); } else if (tensor->op == GGML_OP_SQR) { - tensor_clone = ggml_sqr(ctx, src0_clone); + tensor_clone = ggml_sqr(ggml_ctx, src0_clone); } else if (tensor->op == GGML_OP_CLAMP) { - tensor_clone = ggml_clamp(ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); + tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); } else if (tensor->op == GGML_OP_ADD) { - tensor_clone = ggml_add(ctx, src0_clone, src1_clone); + tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_NORM) { - tensor_clone = ggml_norm(ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_RMS_NORM) { - tensor_clone = ggml_rms_norm(ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_SOFT_MAX) { if (src1 != nullptr) { - tensor_clone = ggml_soft_max_ext(ctx, src0_clone, src1_clone, *(float *)tensor->op_params); + tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, *(float *)tensor->op_params); } else { - tensor_clone = ggml_soft_max(ctx, src0_clone); + tensor_clone = ggml_soft_max(ggml_ctx, src0_clone); } } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { - tensor_clone = ggml_diag_mask_inf(ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_ROPE) { const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + const int n_ggml_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ggml_ctx = ((int32_t *) tensor->op_params)[4]; float freq_base = ((float *) tensor->op_params)[5]; float freq_scale = ((float *) tensor->op_params)[6]; float ext_factor = ((float *) tensor->op_params)[7]; float attn_factor = ((float *) tensor->op_params)[8]; float beta_fast = ((float *) tensor->op_params)[9]; float beta_slow = ((float *) tensor->op_params)[10]; - tensor_clone = ggml_rope_custom(ctx, src0_clone, src1_clone, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: - tensor_clone = ggml_silu(ctx, src0_clone); + tensor_clone = ggml_silu(ggml_ctx, src0_clone); break; case GGML_UNARY_OP_GELU: - tensor_clone = ggml_gelu(ctx, src0_clone); + tensor_clone = ggml_gelu(ggml_ctx, src0_clone); break; case GGML_UNARY_OP_RELU: - tensor_clone = ggml_relu(ctx, src0_clone); + tensor_clone = ggml_relu(ggml_ctx, src0_clone); break; default: std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; @@ -5134,40 +5511,40 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { - tensor_clone = ggml_dup(ctx, src0_clone); + tensor_clone = ggml_dup(ggml_ctx, src0_clone); tensor_clone->type = tensor->type; } else { - tensor_clone = ggml_cpy(ctx, src0_clone, src1_clone); + tensor_clone = ggml_cpy(ggml_ctx, src0_clone, src1_clone); } } else if (tensor->op == GGML_OP_CONT) { - tensor_clone = ggml_cont_4d(ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + tensor_clone = ggml_cont_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_RESHAPE) { - tensor_clone = ggml_reshape_4d(ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + tensor_clone = ggml_reshape_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_VIEW) { - tensor_clone = ggml_view_4d(ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); + tensor_clone = ggml_view_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); } else if (tensor->op == GGML_OP_PERMUTE) { int32_t * params = (int32_t *)tensor->op_params; - tensor_clone = ggml_permute(ctx, src0_clone, params[0], params[1], params[2], params[3]); + tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]); } else if (tensor->op == GGML_OP_TRANSPOSE) { - tensor_clone = ggml_transpose(ctx, src0_clone); + tensor_clone = ggml_transpose(ggml_ctx, src0_clone); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ASSERT(false); } // Disable vulkan here to avoid the hooks in ggml.c - vk_disable = true; + ctx->disable = true; - ggml_cgraph * cgraph = ggml_new_graph(ctx); + ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_build_forward_expand(cgraph, tensor_clone); - ggml_graph_compute_with_ctx(ctx, cgraph, 8); + ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); - vk_disable = false; + ctx->disable = false; ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(tensor_clone, "tensor_clone"); + ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone"); } comp_size = ggml_nbytes(tensor_clone); @@ -5183,10 +5560,10 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * free(src1_buffer); } - ggml_free(ctx); + ggml_free(ggml_ctx); } -void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) { +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) { if (params->ith != 0) { return; } @@ -5208,11 +5585,11 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (extra->offset + tensor_size >= extra->buffer_gpu.size) { - tensor_size = extra->buffer_gpu.size - (extra->offset); + if (extra->offset + tensor_size >= extra->buffer_gpu->size) { + tensor_size = extra->buffer_gpu->size - (extra->offset); } - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size); } float first_error_result = -1.0f; @@ -5339,4 +5716,10 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) free(tensor_data); } } + +void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + ggml_vk_check_results_0(ctx, params, tensor); +} #endif diff --git a/ggml-vulkan.h b/ggml-vulkan.h index eb8a148e2..9645126b4 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -8,24 +8,29 @@ extern "C" { #endif #define GGML_VK_NAME "Vulkan" +#define GGML_VK_MAX_DEVICES 16 -GGML_API void ggml_vk_init(void); +GGML_API void ggml_vk_init_cpu_assist(void); -GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node); -GGML_API void ggml_vk_preallocate_buffers(void); -GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node); -GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); +GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void); +GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node); +GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); #ifdef GGML_VULKAN_CHECK_RESULTS -void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor); +void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); #endif -GGML_API void ggml_vk_graph_cleanup(void); +GGML_API void ggml_vk_graph_cleanup_cpu_assist(void); +GGML_API void ggml_vk_free_cpu_assist(void); // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void); +GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void); +GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); diff --git a/ggml.c b/ggml.c index a7c179f4c..2afdd9df4 100644 --- a/ggml.c +++ b/ggml.c @@ -2343,7 +2343,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { #elif defined(GGML_USE_CLBLAST) ggml_cl_init(); #elif defined(GGML_USE_VULKAN) - ggml_vk_init(); + ggml_vk_init_cpu_assist(); #elif defined(GGML_USE_SYCL) ggml_init_sycl(); #endif @@ -2470,7 +2470,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t max_size = 0; for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) { - max_size = MAX(max_size, ggml_nbytes(tensor)); + size_t bytes = ggml_nbytes(tensor); + max_size = MAX(max_size, bytes); } return max_size; @@ -11887,8 +11888,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims - dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); - dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); } static void ggml_compute_forward_rope_f32( @@ -14847,10 +14850,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); #elif defined(GGML_USE_VULKAN) - const bool skip_cpu = ggml_vk_compute_forward(params, tensor); + const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS if (skip_cpu) { - ggml_vk_check_results_1(params, tensor); + ggml_vk_check_results_1_cpu_assist(params, tensor); } #endif if (skip_cpu) { @@ -17266,12 +17269,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { #ifdef GGML_USE_VULKAN for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]); + ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]); } - ggml_vk_preallocate_buffers(); + ggml_vk_preallocate_buffers_cpu_assist(); for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1); + ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1); } #endif @@ -17327,7 +17330,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { } #ifdef GGML_USE_VULKAN - ggml_vk_graph_cleanup(); + ggml_vk_graph_cleanup_cpu_assist(); #endif // performance stats (graph) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ed8e26f83..1cfd41c0b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -104,6 +104,7 @@ class MODEL_ARCH(IntEnum): CODESHELL = auto() ORION = auto() INTERNLM2 = auto() + MINICPM = auto() class MODEL_TENSOR(IntEnum): @@ -156,6 +157,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -464,6 +466,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.MINICPM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], # TODO } diff --git a/llama.cpp b/llama.cpp index 884f5f5a7..6efdc0962 100644 --- a/llama.cpp +++ b/llama.cpp @@ -229,6 +229,7 @@ enum llm_arch { LLM_ARCH_CODESHELL, LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, + LLM_ARCH_MINICPM, LLM_ARCH_UNKNOWN, }; @@ -252,6 +253,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, }; enum llm_kv { @@ -714,6 +716,29 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_MINICPM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1358,7 +1383,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { #elif defined(GGML_USE_CUBLAS) buft = ggml_backend_cuda_buffer_type(gpu); #elif defined(GGML_USE_VULKAN) - buft = ggml_backend_vk_buffer_type(); + buft = ggml_backend_vk_buffer_type(gpu); #elif defined(GGML_USE_SYCL) buft = ggml_backend_sycl_buffer_type(gpu); #elif defined(GGML_USE_CLBLAST) @@ -1395,6 +1420,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g GGML_UNUSED(tensor_split); } +static size_t llama_get_device_count() { +#if defined(GGML_USE_CUBLAS) + return ggml_backend_cuda_get_device_count(); +#elif defined(GGML_USE_VULKAN) + return ggml_backend_vk_get_device_count(); +#else + return 1; +#endif +} + +static size_t llama_get_device_memory(int device) { +#if defined(GGML_USE_CUBLAS) + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(device, &total, &free); + return free; +#elif defined(GGML_USE_VULKAN) + size_t total; + size_t free; + ggml_backend_vk_get_device_memory(device, &total, &free); + return free; +#else + return 1; + GGML_UNUSED(device); +#endif +} + // // globals // @@ -1418,6 +1470,7 @@ enum e_model { MODEL_UNKNOWN, MODEL_0_5B, MODEL_1B, + MODEL_2B, MODEL_3B, MODEL_4B, MODEL_7B, @@ -1769,6 +1822,10 @@ struct llama_context { ggml_backend_free(backend); } +#ifdef GGML_USE_VULKAN + ggml_vk_free_cpu_assist(); +#endif + ggml_backend_buffer_free(buf_input); ggml_free(ctx_input); } @@ -2794,6 +2851,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { case MODEL_1B: return "1B"; + case MODEL_2B: return "2B"; case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; @@ -2933,6 +2991,13 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM: + { + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_2B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3474,22 +3539,18 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } -#ifdef GGML_USE_CUBLAS if (split_mode == LLAMA_SPLIT_LAYER) { // calculate the split points - int device_count = ggml_backend_cuda_get_device_count(); + int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); - float splits[GGML_CUDA_MAX_DEVICES]; + std::vector splits(device_count); if (all_zero) { // default split, by free memory for (int i = 0; i < device_count; ++i) { - size_t total; - size_t free; - ggml_backend_cuda_get_device_memory(i, &total, &free); - splits[i] = free; + splits[i] = llama_get_device_memory(i); } } else { - std::copy(tensor_split, tensor_split + device_count, splits); + std::copy(tensor_split, tensor_split + device_count, splits.begin()); } // sum and normalize the splits to get the split points @@ -3505,19 +3566,17 @@ static bool llm_load_tensors( // assign the repeating layers to the devices according to the splits int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int64_t i = i_gpu_start; i < n_layer; ++i) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin(); model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); } // assign the output layer if (n_gpu_layers > n_layer) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); model.buft_output = llama_default_buffer_type_offload(layer_gpu); } else { model.buft_output = llama_default_buffer_type_cpu(true); } - } else -#endif - { + } else { ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); @@ -3596,13 +3655,16 @@ static bool llm_load_tensors( switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: + case LLM_ARCH_MINICPM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + if (model.arch != LLM_ARCH_MINICPM){ + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } } for (int i = 0; i < n_layer; ++i) { @@ -6853,6 +6915,153 @@ struct llm_build_context { return gf; } + // ref: https://arxiv.org/abs/2203.03466 + // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 + // based on the original build_llama() function + struct ggml_cgraph * build_minicpm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const int64_t n_embd = hparams.n_embd; + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", -1); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", -1); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph( @@ -7015,6 +7224,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_internlm2(); } break; + case LLM_ARCH_MINICPM: + { + result = llm.build_minicpm(); + } break; default: GGML_ASSERT(false); } @@ -9778,8 +9991,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -9818,9 +10031,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } - //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K; - //} + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { + new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K @@ -10816,13 +11029,15 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_VULKAN) if (model->n_gpu_layers > 0) { - ggml_backend_t backend = ggml_backend_vk_init(); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__); - llama_free(ctx); - return nullptr; + for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_vk_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); } - ctx->backends.push_back(backend); } #elif defined(GGML_USE_SYCL) if (model->n_gpu_layers > 0) { diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 0b83cdbbc..30bbac321 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -14,16 +14,17 @@ # - Might be unstable! # # Usage: -# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] +# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive] # -# --port: port number, default is 8888 -# --repo: path to a repo containing GGUF model files -# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input -# --backend: cpu, cuda, metal, opencl, depends on the OS -# --gpu-id: gpu id, default is 0 -# --n-parallel: number of parallel requests, default is 8 -# --n-kv: KV cache size, default is 4096 -# --verbose: verbose output +# --port: port number, default is 8888 +# --repo: path to a repo containing GGUF model files +# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input +# --backend: cpu, cuda, metal, opencl, depends on the OS +# --gpu-id: gpu id, default is 0 +# --n-parallel: number of parallel requests, default is 8 +# --n-kv: KV cache size, default is 4096 +# --verbose: verbose output +# --non-interactive: run without asking a permission to run # # Example: # @@ -47,6 +48,7 @@ if ! command -v make &> /dev/null; then fi # parse arguments +is_interactive=1 port=8888 repo="" wtype="" @@ -66,15 +68,16 @@ verbose=0 function print_usage { printf "Usage:\n" - printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n" - printf " --port: port number, default is 8888\n" - printf " --repo: path to a repo containing GGUF model files\n" - printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" - printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" - printf " --gpu-id: gpu id, default is 0\n" - printf " --n-parallel: number of parallel requests, default is 8\n" - printf " --n-kv: KV cache size, default is 4096\n" - printf " --verbose: verbose output\n\n" + printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n" + printf " --port: port number, default is 8888\n" + printf " --repo: path to a repo containing GGUF model files\n" + printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" + printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" + printf " --gpu-id: gpu id, default is 0\n" + printf " --n-parallel: number of parallel requests, default is 8\n" + printf " --n-kv: KV cache size, default is 4096\n" + printf " --verbose: verbose output\n\n" + printf " --non-interactive: run without asking a permission to run\n" printf "Example:\n\n" printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' } @@ -82,6 +85,10 @@ function print_usage { while [[ $# -gt 0 ]]; do key="$1" case $key in + --non-interactive) + is_interactive=0 + shift + ;; --port) port="$2" shift @@ -176,31 +183,32 @@ repos=( "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" ) +if [ $is_interactive -eq 1 ]; then + printf "\n" + printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" + printf " Based on the options that follow, the script might download a model file\n" + printf " from the internet, which can be a few GBs in size. The script will also\n" + printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" + printf "\n" + printf " Upon success, an HTTP server will be started and it will serve the selected\n" + printf " model using llama.cpp for demonstration purposes.\n" + printf "\n" + printf " Please note:\n" + printf "\n" + printf " - All new data will be stored in the current folder\n" + printf " - The server will be listening on all network interfaces\n" + printf " - The server will run with default settings which are not always optimal\n" + printf " - Do not judge the quality of a model based on the results from this script\n" + printf " - Do not use this script to benchmark llama.cpp\n" + printf " - Do not use this script in production\n" + printf " - This script is only for demonstration purposes\n" + printf "\n" + printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" + printf "\n" + printf " Press Enter to continue ...\n\n" -printf "\n" -printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" -printf " Based on the options that follow, the script might download a model file\n" -printf " from the internet, which can be a few GBs in size. The script will also\n" -printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" -printf "\n" -printf " Upon success, an HTTP server will be started and it will serve the selected\n" -printf " model using llama.cpp for demonstration purposes.\n" -printf "\n" -printf " Please note:\n" -printf "\n" -printf " - All new data will be stored in the current folder\n" -printf " - The server will be listening on all network interfaces\n" -printf " - The server will run with default settings which are not always optimal\n" -printf " - Do not judge the quality of a model based on the results from this script\n" -printf " - Do not use this script to benchmark llama.cpp\n" -printf " - Do not use this script in production\n" -printf " - This script is only for demonstration purposes\n" -printf "\n" -printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" -printf "\n" -printf " Press Enter to continue ...\n\n" - -read + read +fi if [[ -z "$repo" ]]; then printf "[+] No repo provided from the command line\n"