From 7d51a13c9b81e5edfa21a841dbdaaa95bea48840 Mon Sep 17 00:00:00 2001 From: djw Date: Mon, 21 Jul 2025 12:26:14 +0000 Subject: [PATCH] support npu --- csrc/ktransformers_ext/CMakeLists.txt | 68 +- ktransformers/local_chat_npu.py | 257 + ktransformers/models/custom_cache.py | 65 +- ktransformers/operators/attention.py | 8 +- .../flashinfer_batch_prefill_wrapper.py | 5 +- ktransformers/operators/flashinfer_wrapper.py | 6 +- ktransformers/operators/linear.py | 10 +- ktransformers/optimize/optimize.py | 31 +- .../DeepSeek-V3-Chat-800IA2-npu.yaml | 0 .../npu/DeepSeek-V3-Chat-npu.yaml | 76 + ktransformers/server/args.py | 4 + ktransformers/server/backend/args.py | 1 + .../backend/interfaces/ktransformers.py | 297 +- .../server/backend/interfaces/transformers.py | 149 +- ktransformers/server/main.py | 93 +- .../server/utils/create_interface.py | 9 +- ktransformers/util/ascend/ascend_utils.py | 210 + ktransformers/util/custom_loader.py | 57 +- ktransformers/util/npu_graph.py | 77 + ktransformers/util/npu_graph_runner.py | 218 + ktransformers/util/utils.py | 371 +- merge_tensors/merge_safetensor_gguf.py | 39 +- setup.py | 32 +- third_party/llamafile/iqk_mul_mat.inc | 4930 +------------- third_party/llamafile/iqk_mul_mat_arm.inc | 5866 +++++++++++++++++ third_party/llamafile/iqk_mul_mat_arm80.cpp | 10 + third_party/llamafile/iqk_mul_mat_x86.inc | 4925 ++++++++++++++ third_party/llamafile/sgemm.cpp | 209 +- third_party/llamafile/sgemm_arm.cpp | 204 + third_party/llamafile/sgemm_x86.cpp | 204 + .../llamafile/tinyblas_cpu_mixmul_arm80.cpp | 1 + third_party/llamafile/tinyblas_cpu_sgemm.inc | 366 +- .../llamafile/tinyblas_cpu_sgemm_arm.inc | 471 ++ .../llamafile/tinyblas_cpu_sgemm_x86.inc | 361 + 34 files changed, 14004 insertions(+), 5626 deletions(-) create mode 100644 ktransformers/local_chat_npu.py rename ktransformers/optimize/optimize_rules/{ => npu}/DeepSeek-V3-Chat-800IA2-npu.yaml (100%) create mode 100644 ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-npu.yaml create mode 100644 ktransformers/util/ascend/ascend_utils.py create mode 100644 ktransformers/util/npu_graph.py create mode 100644 ktransformers/util/npu_graph_runner.py create mode 100644 third_party/llamafile/iqk_mul_mat_arm.inc create mode 100644 third_party/llamafile/iqk_mul_mat_arm80.cpp create mode 100644 third_party/llamafile/iqk_mul_mat_x86.inc create mode 100644 third_party/llamafile/sgemm_arm.cpp create mode 100644 third_party/llamafile/sgemm_x86.cpp create mode 100644 third_party/llamafile/tinyblas_cpu_sgemm_arm.inc create mode 100644 third_party/llamafile/tinyblas_cpu_sgemm_x86.inc diff --git a/csrc/ktransformers_ext/CMakeLists.txt b/csrc/ktransformers_ext/CMakeLists.txt index cbee533..5b818b3 100644 --- a/csrc/ktransformers_ext/CMakeLists.txt +++ b/csrc/ktransformers_ext/CMakeLists.txt @@ -44,6 +44,10 @@ option(KTRANSFORMERS_USE_ROCM "ktransformers: use ROCM" option(KTRANSFORMERS_USE_XPU "ktransformers: use XPU" OFF) option(KTRANSFORMERS_USE_NPU "ktransformers: use NPU" OFF) +if(KTRANSFORMERS_USE_NPU) + add_definitions(-DKTRANSFORMERS_USE_NPU=1) +endif() + # Architecture specific # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue @@ -90,6 +94,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR endif () set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) else() + if(KTRANSFORMERS_USE_NPU) + list(APPEND ARCH_FLAGS -march=armv8.2-a+fp16+fp16fml+dotprod -lnuma) + endif() 
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") list(APPEND ARCH_FLAGS -mfp16-format=ieee) @@ -117,37 +124,38 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) message(STATUS "x86 detected") - set(HOST_IS_X86 TRUE) - set(HAS_AVX512 TRUE) - set(__HAS_AMX__ TRUE) - add_compile_definitions(__x86_64__) - # check AVX512 - execute_process( - COMMAND lscpu - OUTPUT_VARIABLE LSCPU_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - # message(STATUS "LSCPU_OUTPUT: ${LSCPU_OUTPUT}") + if(NOT KTRANSFORMERS_USE_NPU) + set(HOST_IS_X86 TRUE) + set(HAS_AVX512 TRUE) + set(__HAS_AMX__ TRUE) + add_compile_definitions(__x86_64__) + # check AVX512 + execute_process( + COMMAND lscpu + OUTPUT_VARIABLE LSCPU_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + # message(STATUS "LSCPU_OUTPUT: ${LSCPU_OUTPUT}") - string(FIND "${LSCPU_OUTPUT}" "avx512" COMPILER_SUPPORTS_AVX512F) - - if (COMPILER_SUPPORTS_AVX512F GREATER -1) - message(STATUS "Compiler and CPU support AVX512F (tested by compiling a program)") - add_compile_definitions(__HAS_AVX512F__) - else() - message(STATUS "Compiler and/or CPU do NOT support AVX512F") - set(HAS_AVX512 False) - endif() + string(FIND "${LSCPU_OUTPUT}" "avx512" COMPILER_SUPPORTS_AVX512F) + + if (COMPILER_SUPPORTS_AVX512F GREATER -1) + message(STATUS "Compiler and CPU support AVX512F (tested by compiling a program)") + add_compile_definitions(__HAS_AVX512F__) + else() + message(STATUS "Compiler and/or CPU do NOT support AVX512F") + set(HAS_AVX512 False) + endif() - # check AMX - string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX) - - if(COMPILER_SUPPORTS_AMX GREATER -1) - message(STATUS "Compiler supports AMX") - add_compile_definitions(__HAS_AMX__) - else() - message(STATUS "Compiler does NOT support AMX") - endif() + # check AMX + string(FIND "${LSCPU_OUTPUT}" "amx" COMPILER_SUPPORTS_AMX) + + if(COMPILER_SUPPORTS_AMX GREATER -1) + message(STATUS "Compiler supports AMX") + add_compile_definitions(__HAS_AMX__) + else() + message(STATUS "Compiler does NOT support AMX") + endif() if (MSVC) # instruction set detection for MSVC only if (LLAMA_NATIVE) @@ -281,6 +289,8 @@ if (WIN32) include_directories("$ENV{CUDA_PATH}/include") add_compile_definitions(KTRANSFORMERS_USE_CUDA=1) elseif (UNIX) + + if (KTRANSFORMERS_USE_ROCM) find_package(HIP REQUIRED) if(HIP_FOUND) diff --git a/ktransformers/local_chat_npu.py b/ktransformers/local_chat_npu.py new file mode 100644 index 0000000..57b1ead --- /dev/null +++ b/ktransformers/local_chat_npu.py @@ -0,0 +1,257 @@ +""" +Description : +Author : Boxin Zhang, Azure-Tang +Version : 0.1.0 +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+""" + +import os +import platform +import sys + +project_dir = os.path.dirname(os.path.dirname(__file__)) +sys.path.insert(0, project_dir) +import torch +import torch_npu +from torch_npu.contrib import transfer_to_npu +import torch.distributed as dist + +import logging +from transformers import ( + AutoTokenizer, + AutoConfig, + AutoModelForCausalLM, + GenerationConfig, + TextStreamer, +) +import json +import fire +from ktransformers.optimize.optimize import optimize_and_load_gguf +from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM +from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM +from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM +from ktransformers.models.modeling_llama import LlamaForCausalLM +from ktransformers.models.modeling_mixtral import MixtralForCausalLM +from ktransformers.util.utils import prefill_and_generate, get_compute_capability +from ktransformers.util.ascend.ascend_utils import get_absort_weight, setup_model_parallel, get_tensor_parallel_group +from ktransformers.util import utils +from ktransformers.models.custom_cache import StaticCache +from ktransformers.server.config.config import Config +from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled +from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor + +custom_models = { + "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM, + "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM, + "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM, + "LlamaForCausalLM": LlamaForCausalLM, + "MixtralForCausalLM": MixtralForCausalLM, +} +torch.npu.config.allow_internal_format = True + +ktransformer_rules_dir = ( + os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/" +) +default_optimize_rules = { + "DeepseekV3ForCausalLM": ktransformer_rules_dir + "npu/DeepSeek-V3-Chat.yaml", +} +torch.npu.set_compile_mode(jit_compile=False) + + +import sys, signal, faulthandler +faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True, chain=False) + + +def local_chat( + model_path: str | None = None, + optimize_config_path: str = None, + gguf_path: str | None = None, + max_new_tokens: int = 1000, + cpu_infer: int = Config().cpu_infer, + use_cuda_graph: bool = False, + prompt_file : str | None = None, + mode: str = "normal", + force_think: bool = False, + chunk_size: int = utils._MAX_CHUNK_SIZE, + q4_gguf_path: str | None = None, + tp: int = 1, +): + utils.USE_NPU_GRAPH = use_cuda_graph + torch.npu.config.allow_internal_format = False + torch.set_grad_enabled(False) + Config().cpu_infer = cpu_infer + + local_rank, world_size = setup_model_parallel(tp=tp) + if utils.CUR_DEVICE is None: + utils.CUR_DEVICE = f"npu:{torch.npu.current_device()}" + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if use_cuda_graph: + from ktransformers.util import npu_graph_runner + npu_graph_runner.LAYER_ID = config.num_hidden_layers + if mode == 'long_context': + assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode" + torch.set_default_dtype(torch.float16) + else: + torch.set_default_dtype(config.torch_dtype) + + with torch.device("meta"): + if config.architectures[0] in custom_models: + print("using custom modeling_xxx.py.") + if ( + "Qwen2Moe" in config.architectures[0] + ): # Qwen2Moe must use flash_attention_2 to avoid overflow. 
+ config._attn_implementation = "flash_attention_2" + if "Llama" in config.architectures[0]: + config._attn_implementation = "eager" + if "Mixtral" in config.architectures[0]: + config._attn_implementation = "flash_attention_2" + + model = custom_models[config.architectures[0]](config) + else: + model = AutoModelForCausalLM.from_config( + config, trust_remote_code=True, attn_implementation="flash_attention_2" + ) + + if optimize_config_path is None: + if config.architectures[0] in default_optimize_rules: + print("using default_optimize_rule for", config.architectures[0]) if local_rank == 0 else None + optimize_config_path = default_optimize_rules[config.architectures[0]] + print(f'{optimize_config_path=}') if local_rank == 0 else None + else: + optimize_config_path = input( + "please input the path of your rule file(yaml file containing optimize rules):" + ) + + if gguf_path is None: + gguf_path = input( + "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):" + ) + optimize_and_load_gguf(model, optimize_config_path, gguf_path, config, q4_gguf_path=q4_gguf_path) + get_absort_weight(model, config) + + try: + model.generation_config = GenerationConfig.from_pretrained(model_path) + except Exception as e: + print(f"generation config can't auto create, make default. Message: {e}") + gen_config = GenerationConfig( + temperature=0.6, + top_p=0.95, + do_sample=True + ) + model.generation_config = gen_config + # model.generation_config = GenerationConfig.from_pretrained(model_path) + if model.generation_config.pad_token_id is None: + model.generation_config.pad_token_id = model.generation_config.eos_token_id + model.eval() + logging.basicConfig(level=logging.INFO) + + system = platform.system() + if system == "Windows": + os.system("cls") if local_rank == 0 else None + else: + os.system("clear") if local_rank == 0 else None + + print(f"{model=}") if local_rank == 0 else None + + batch_size, seq_length = 1, 1024 + device_map = model.gguf_loader.tensor_device_map + static_cache = StaticCache( + config = model.config, max_batch_size = batch_size, max_cache_len = seq_length + max_new_tokens, device = device_map, + dtype = model.dtype + ) + chunk_size = int(chunk_size) + new_chunk_size = min(max(chunk_size, 512), utils._MAX_CHUNK_SIZE) + if new_chunk_size != chunk_size: + chunk_size = new_chunk_size + print(f'[WARN] Chunk size reset to legal value between [512, {utils._MAX_CHUNK_SIZE}] which is {chunk_size}.') + + torch.distributed.barrier() + while True: + if local_rank == 0: + try: + content = input("Chat: ").strip() + except KeyboardInterrupt: + dist.barrier() + print('Exit all ranks with KeyboardInterrupt!') + sys.exit(0) + if content.startswith('"""'): # prefix """ + # multi lines input + content = content[3:] + "\n" + while True: + line = input("") + if line.endswith('"""'): + # end multi lines input + line = line[:-3] # suffix """ + if line: + content += line + "\n" + break + else: + content += line + "\n" + + if content == "": + if prompt_file != None: + content = open(prompt_file, "r").read() + else: + continue + elif os.path.isfile(content): + f = open(content, "r") + content = f.readlines() + f.close() + else: + content = [f"{len(content)},{max_new_tokens},{content}"] + else: + content = [""] + + for line in content: + content_tensor = torch.tensor(bytearray(line.encode()), dtype=torch.uint8).to(device=utils.CUR_DEVICE) + if world_size > 1: + content_size = torch.tensor(len(content_tensor), 
dtype=torch.int64).to(device=utils.CUR_DEVICE) + all_content_sizes = [torch.zeros((1,), dtype=torch.int64).to(device=utils.CUR_DEVICE) for _ in range(world_size)] + dist.barrier() + dist.all_gather(all_content_sizes, content_size) + max_content_size = max([size.item() for size in all_content_sizes]) + + padded_content_tensor = torch.zeros((max_content_size,), dtype=torch.uint8).to(device=utils.CUR_DEVICE) + padded_content_tensor[:len(content_tensor)] = content_tensor + + all_content_tensors = [torch.zeros((max_content_size,), dtype=torch.uint8).to(device=utils.CUR_DEVICE) for _ in range(world_size)] + dist.barrier() + dist.all_gather(all_content_tensors, padded_content_tensor) + content_tensor = all_content_tensors[0][:all_content_sizes[0].item()] + line = bytes(content_tensor.cpu().numpy()).decode() + + parts = line.split(",") + input_tokens = int(parts[0]) + max_new_tokens = int(parts[1]) + line = line[line.index(",", line.index(",") + 1) + 1:] + + messages = [{"role": "user", "content": line}] + input_tensor = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, return_tensors="pt" + ) + if force_think: + token_thinks = torch.tensor([tokenizer.encode("\\n",add_special_tokens=False)],device=input_tensor.device) + input_tensor = torch.cat( + [input_tensor, token_thinks], dim=1 + ) + if mode == 'long_context': + assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \ + "please change max_seq_len in ~/.ktransformers/config.yaml" + + if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8 and device_manager.gpu_vendor == GPUVendor.NVIDIA: + generated = prefill_and_generate( + model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size, + use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim, + static_cache=static_cache + ) + else: + generated = prefill_and_generate( + model, tokenizer, input_tensor.to(device=utils.CUR_DEVICE), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_size = chunk_size, + static_cache=static_cache + ) + + +if __name__ == "__main__": + fire.Fire(local_chat) \ No newline at end of file diff --git a/ktransformers/models/custom_cache.py b/ktransformers/models/custom_cache.py index 350af73..66b05a9 100644 --- a/ktransformers/models/custom_cache.py +++ b/ktransformers/models/custom_cache.py @@ -16,6 +16,16 @@ try: from ktransformers.server.balance_serve.settings import sched_ext except: print("no balance_serve") + + +try: + import torch_npu + from ktransformers.util import utils + + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + class StaticCache(transformers.StaticCache): """ Static Cache class to be used with `torch.compile(model)`. 
@@ -37,6 +47,10 @@ class StaticCache(transformers.StaticCache): def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device| dict, dtype=None) -> None: Cache.__init__(self) self.max_batch_size = max_batch_size + + if use_torch_npu: + self.position = [0] + self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads if config.architectures[0] == "DeepseekV3ForCausalLM": @@ -56,8 +70,18 @@ class StaticCache(transformers.StaticCache): cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) if config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM": # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically - self.page_size = 64 - self.max_pages = (self.max_cache_len + self.page_size - 1) // self.page_size + + if use_torch_npu: + self.page_size = 128 + self.page_size_tensor = torch.tensor( + self.page_size, + dtype=torch.int32, + ).npu() + self.max_pages_per_batch = (self.max_cache_len + self.page_size - 1) // self.page_size + self.max_pages = (self.max_cache_len + self.page_size - 1) // self.page_size * self.max_batch_size + else: + self.page_size = 64 + self.max_pages = (self.max_cache_len + self.page_size - 1) // self.page_size latent_shape = (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim) self.kv_lora_rank = config.kv_lora_rank self.qk_rope_head_dim = config.qk_rope_head_dim @@ -71,9 +95,14 @@ class StaticCache(transformers.StaticCache): target_device = device if target_device not in self.page_table_map: - page_table = torch.zeros((max_batch_size, self.max_pages), dtype=torch.int32, device=target_device) - for seq_id in range(max_batch_size): - page_table[seq_id, :] = torch.arange(seq_id * self.max_pages, seq_id * self.max_pages + self.max_pages, dtype=torch.int32, device=target_device) + if use_torch_npu: + page_table = torch.zeros((max_batch_size, self.max_pages_per_batch), dtype=torch.int32, device=target_device) + for seq_id in range(max_batch_size): + page_table[seq_id, :] = torch.arange(seq_id * self.max_pages_per_batch, seq_id * self.max_pages_per_batch + self.max_pages_per_batch, dtype=torch.int32, device=target_device) + else: + page_table = torch.zeros((max_batch_size, self.max_pages), dtype=torch.int32, device=target_device) + for seq_id in range(max_batch_size): + page_table[seq_id, :] = torch.arange(seq_id * self.max_pages, seq_id * self.max_pages + self.max_pages, dtype=torch.int32, device=target_device) self.page_table_map[target_device] = page_table self.page_table_list.append(self.page_table_map[target_device]) @@ -140,11 +169,24 @@ class StaticCache(transformers.StaticCache): self.past_tokens[layer_idx] += cache_position.size(0) #print(cache_position) if self.is_MLA: - page_idx = cache_position // self.page_size - page_offset = cache_position % self.page_size - # key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim) - k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states - k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states + if use_torch_npu: + page_idx = cache_position // self.page_size_tensor + page_offset = cache_position % self.page_size_tensor + + page_idx = page_idx.unsqueeze(0).expand(self.max_batch_size, -1) + page_offset = 
page_offset.unsqueeze(0).expand(self.max_batch_size, -1) + + page_idx_offset = torch.arange(self.max_batch_size, device=page_idx.device) * self.max_pages_per_batch + page_idx = page_idx + page_idx_offset.unsqueeze(1) + + combined = torch.cat([key_states, value_states], dim=-1) + combined = combined.contiguous() + else: + page_idx = cache_position // self.page_size + page_offset = cache_position % self.page_size + # key shape (self.max_pages, self.page_size, 1, config.kv_lora_rank + config.qk_rope_head_dim) + k_out[page_idx, page_offset, :, :self.kv_lora_rank] = key_states + k_out[page_idx, page_offset, :, self.kv_lora_rank:] = value_states return k_out, self.page_table_list[layer_idx] else: k_out[:, :, cache_position] = key_states @@ -178,6 +220,9 @@ class StaticCache(transformers.StaticCache): if self.value_cache[layer_idx] is not None: self.value_cache[layer_idx].zero_() self.past_tokens[layer_idx] = 0 + + if use_torch_npu: + self.position = [0] def remove_suffix(self, start_pos): for layer_idx in range(len(self.key_cache)): diff --git a/ktransformers/operators/attention.py b/ktransformers/operators/attention.py index 9dfdbdc..e478d09 100644 --- a/ktransformers/operators/attention.py +++ b/ktransformers/operators/attention.py @@ -27,8 +27,12 @@ try: from flash_attn import flash_attn_func except: pass -from ktransformers.operators.triton_attention import decode_attention_fwd_grouped -from ktransformers.operators.triton_attention_prefill import context_attention_fwd +try: + from ktransformers.operators.triton_attention import decode_attention_fwd_grouped + from ktransformers.operators.triton_attention_prefill import context_attention_fwd +except: + Warning("triton not found, if you are using npu, ignore this.") + import os from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled if flashinfer_enabled: diff --git a/ktransformers/operators/flashinfer_batch_prefill_wrapper.py b/ktransformers/operators/flashinfer_batch_prefill_wrapper.py index 287affb..e9ee585 100644 --- a/ktransformers/operators/flashinfer_batch_prefill_wrapper.py +++ b/ktransformers/operators/flashinfer_batch_prefill_wrapper.py @@ -1,5 +1,8 @@ import torch -import flashinfer +try: + import flashinfer +except: + Warning("flashinfer not found, if you are using npu, ignore this.") import gc try: from flash_attn import flash_attn_with_kvcache diff --git a/ktransformers/operators/flashinfer_wrapper.py b/ktransformers/operators/flashinfer_wrapper.py index 81fd75e..07b04e4 100644 --- a/ktransformers/operators/flashinfer_wrapper.py +++ b/ktransformers/operators/flashinfer_wrapper.py @@ -5,7 +5,11 @@ Version : 0.2.3 ''' import torch import os -from ktransformers.operators.triton_attention import decode_attention_fwd_grouped + +try: + from ktransformers.operators.triton_attention import decode_attention_fwd_grouped +except: + Warning("triton not found, if you are using npu, ignore this.") flashinfer_enabled = False diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py index 654c9f9..e2184ff 100644 --- a/ktransformers/operators/linear.py +++ b/ktransformers/operators/linear.py @@ -14,7 +14,15 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
import ctypes import torch from torch import Tensor, nn -if not torch.xpu.is_available(): + +try: + import torch_npu + + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + +if not torch.xpu.is_available() and not use_torch_npu: import KTransformersOps import vLLMMarlin from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader diff --git a/ktransformers/optimize/optimize.py b/ktransformers/optimize/optimize.py index bbe08c8..3052fce 100644 --- a/ktransformers/optimize/optimize.py +++ b/ktransformers/optimize/optimize.py @@ -16,6 +16,7 @@ from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory from ktransformers.util.utils import set_module, load_weights import itertools import copy +from ktransformers.util import utils def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''): for name, child in module._modules.items(): @@ -114,7 +115,7 @@ def translate_model_config(model_config: PretrainedConfig): return model_config -def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"): +def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0", q4_gguf_path=""): with open(rule_file, 'r', encoding='utf-8') as f: rule_list = yaml.load(f.read(), Loader=yaml.FullLoader) @@ -123,15 +124,29 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo model_config = translate_model_config(model_config) - weights_loader = ModelLoaderFactory.create_loader(gguf_path) - with torch.device("meta"): - inject(module, optimize_config, model_config, weights_loader) - # pre load lm_head because its big inter result - load_weights(module.lm_head, weights_loader, "lm_head.", device=default_device) - load_weights(module, weights_loader, device=default_device) - module.gguf_loader = weights_loader + if q4_gguf_path: + q4_gguf_loader = GGUFLoader(q4_gguf_path) + utils.Q4_GGUF_LODER = q4_gguf_loader + gguf_loader = GGUFLoader(gguf_path, getattr(model_config, "quantize", None)) + with torch.device("meta"): + inject(module, optimize_config, model_config, gguf_loader) + # pre load lm_head because its big inter result + load_weights(module.lm_head, gguf_loader, "lm_head.") + load_weights(module, gguf_loader) + module.gguf_loader = gguf_loader + + else: + weights_loader = ModelLoaderFactory.create_loader(gguf_path) + with torch.device("meta"): + inject(module, optimize_config, model_config, weights_loader) + # pre load lm_head because its big inter result + load_weights(module.lm_head, weights_loader, "lm_head.", device=default_device) + load_weights(module, weights_loader, device=default_device) + module.gguf_loader = weights_loader del_meta(module) if torch.cuda.is_available(): torch.cuda.empty_cache() elif torch.xpu.is_available(): torch.xpu.empty_cache() + else: + torch.cuda.empty_cache() \ No newline at end of file diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-800IA2-npu.yaml b/ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-800IA2-npu.yaml similarity index 100% rename from ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-800IA2-npu.yaml rename to ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-800IA2-npu.yaml diff --git a/ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-npu.yaml b/ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-npu.yaml new file mode 100644 
index 0000000..9bd7bc3 --- /dev/null +++ b/ktransformers/optimize/optimize_rules/npu/DeepSeek-V3-Chat-npu.yaml @@ -0,0 +1,76 @@ +- match: + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "npu" + prefill_device: "npu" + +- match: + name: "^lm_head$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "npu" + prefill_device: "npu" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "npu" + prefill_device: "npu" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "npu" + prefill_device: "npu" +- match: + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "npu:0" + prefill_device: "npu:0" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "npu" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "npu" + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "npu" + prefill_device: "npu" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). 
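Each `- match:` rule above pairs a name regular expression and/or a class with a `replace:` spec. The Linear rule uses a negative lookahead so that `self_attn.kv_b_proj` is left on its original implementation; its weight is consumed later by `get_absort_weight`. A minimal sketch of how that regex behaves (the module names are hypothetical examples, not taken from the patch; the YAML escapes `\\.` become single backslashes in raw Python):

import re

# Same pattern as the Linear rule above, unescaped from YAML.
linear_rule = r"^model\.layers\.(?!.*self_attn\.kv_b_proj).*$"

examples = [
    "model.layers.0.mlp.gate_proj",        # matches -> replaced with KTransformersLinear
    "model.layers.0.self_attn.q_b_proj",   # matches -> replaced
    "model.layers.0.self_attn.kv_b_proj",  # excluded -> kept for weight absorption
]

for name in examples:
    print(f"{name}: {bool(re.match(linear_rule, name))}")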
+- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/ktransformers/server/args.py b/ktransformers/server/args.py index 748bd47..91b72eb 100644 --- a/ktransformers/server/args.py +++ b/ktransformers/server/args.py @@ -22,6 +22,10 @@ class ArgumentParser: "--device", type=str, default=self.cfg.model_device, help="Warning: Abandoning this parameter" ) parser.add_argument("--architectures", type=str, default=self.cfg.model_name) + + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--q4_gguf_path", type=str, default=None) + parser.add_argument("--gguf_path", type=str, default=self.cfg.gguf_path) parser.add_argument("--optimize_config_path", default=None, type=str, required=False) parser.add_argument("--cpu_infer", type=int, default=self.cfg.cpu_infer) diff --git a/ktransformers/server/backend/args.py b/ktransformers/server/backend/args.py index 1c602b1..c16a98a 100644 --- a/ktransformers/server/backend/args.py +++ b/ktransformers/server/backend/args.py @@ -8,6 +8,7 @@ class ConfigArgs(BaseModel): model_dir: Optional[str] = Field(..., description="Path to model directory") optimize_config_path: Optional[str] = Field(None, description="Path of your optimize config yml file") gguf_path: Optional[str] = Field(None, description="Path of your gguf file") + tp: int = Field(None, description="tp size") class Config: protected_namespaces = () diff --git a/ktransformers/server/backend/interfaces/ktransformers.py b/ktransformers/server/backend/interfaces/ktransformers.py index fd2a808..58ba2d7 100644 --- a/ktransformers/server/backend/interfaces/ktransformers.py +++ b/ktransformers/server/backend/interfaces/ktransformers.py @@ -1,4 +1,19 @@ import torch +from torch import nn + +try: + import torch_npu + from ktransformers.util.ascend.ascend_utils import get_absort_weight, setup_model_parallel + from ktransformers.util.utils import get_device, get_all_used_cuda_device + from ktransformers.util import utils + + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + +import os + + from typing import Optional, List import asyncio from transformers import AutoTokenizer, AutoConfig, GenerationConfig @@ -19,6 +34,9 @@ from typing import Optional from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton from ktransformers.server.schemas.endpoints.chat import RawUsage + + + warm_uped = False class KTransformersThreadContext(TransformersThreadContext): @@ -26,8 +44,15 @@ class KTransformersThreadContext(TransformersThreadContext): class KTransformersInterface(TransformersInterface): - def __init__(self, args: ConfigArgs = default_args): - self.args = args + def __init__(self, args: ConfigArgs = default_args, input_args=None): + if use_torch_npu: + self.args = input_args + self.local_rank, self.world_size = setup_model_parallel(tp=self.args.tp) + if utils.CUR_DEVICE is None: + utils.CUR_DEVICE = f"npu:{torch.npu.current_device()}" + self.args.device = utils.CUR_DEVICE + else: + self.args = args torch.set_grad_enabled(False) self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, device=args.device, trust_remote_code=args.trust_remote_code) config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=args.trust_remote_code) @@ 
-47,7 +72,10 @@ class KTransformersInterface(TransformersInterface): with torch.device("meta"): self.model = custom_models[config.architectures[0]](config) - if default_args.optimize_config_path is None: + + if use_torch_npu and input_args.optimize_config_path is not None: + optimize_config_path = input_args.optimize_config_path + elif default_args.optimize_config_path is None: optimize_config_path = default_optimize_rules[config.architectures[0]] else: optimize_config_path = args.optimize_config_path @@ -60,7 +88,14 @@ class KTransformersInterface(TransformersInterface): "please input the path of your gguf file(gguf file in the dir containing input gguf file must all" " belong to current model):" ) - optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config) + + if use_torch_npu: + optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config, q4_gguf_path=input_args.q4_gguf_path) + #提前absorbed + get_absort_weight(self.model, config) + self.model.eval() + else: + optimize_and_load_gguf(self.model, optimize_config_path, gguf_path, config) self.model.generation_config = generation_config self.device_map = self.model.gguf_loader.tensor_device_map # logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}") @@ -77,9 +112,92 @@ class KTransformersInterface(TransformersInterface): self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id self.streamer = TextStreamer(self.tokenizer) + if use_torch_npu: + self.top_p = torch.tensor([[self.model.generation_config.top_p]], dtype=torch.float16, device=self.args.device) + self.top_k = torch.tensor([[self.model.generation_config.top_k]], dtype=torch.int32, device=self.args.device) + self.temperature = torch.tensor([[self.model.generation_config.temperature]], dtype=torch.float16, device=self.args.device) + self.next_token_fake = torch.tensor([[1]], dtype=torch.int32, device=self.args.device) + self.next_token_probs = torch.tensor([[1.0]], dtype=torch.float16, device=self.args.device) + self._infer_lock = asyncio.Lock() + + self._infer_lock = asyncio.Lock() + def decode_logits_to_token(self, logits: torch.Tensor): + if self.model.generation_config.do_sample: + logits = logits / self.temperature + torch.manual_seed(0) + probs = logits.view(1, self.model.config.vocab_size) + sm = nn.Softmax(dim=-1) + probs = sm(probs).half().npu() + next_token = self.next_token_fake + torch_npu._npu_topk_topp_sampling(probs, self.top_k, self.top_p, next_token, self.next_token_probs) + last = next_token.squeeze(-1) + else: + logits = self.logits_warper(self.inputs.view(1, -1), logits.view(1, -1)) + probs = torch.nn.functional.softmax(logits, dim=-1) + _, last = torch.topk(probs, k=1, dim=-1) + last = last.item() + self.ever_generated_ids.add(last) + return last + + def decode_one_tokens_npu(self): + global warm_uped + + device_map = self.model.gguf_loader.tensor_device_map + torch_device = get_device("blk.0.self_attn", device_map) + torch_device = "cuda:0" if torch_device == "cuda" else torch_device + torch.cuda.set_device(torch_device) + if warm_uped and self.args.use_cuda_graph: + from ktransformers.util.npu_graph_runner import get_or_create_runner, check_runner + if check_runner(self.args.device): + npu_graph_runner = get_or_create_runner(self.args.device) + npu_graph_runner.init(self.args.batch_size, self.seq_length) + self.cuda_graph_runner = npu_graph_runner + utils._USE_NPU_GRAPH = True + self.cuda_graph_runner.capture( + self.model, + self.current_ids, + 
self.active_cache_position.unsqueeze(0), + self.active_cache_position, + self.cache, + main_device=self.args.device, + return_dict=False, + use_cache=True, + ) + + if hasattr(self, "cuda_graph_runner"): + inputs_embeds = self.model.model.embed_tokens(self.current_ids.to("cpu")).to(self.args.device) + logits = self.cuda_graph_runner( + inputs_embeds, self.active_cache_position.unsqueeze(0), self.active_cache_position + ) + self.cache.change_seq_length(1) + torch.cuda.synchronize() + logits = logits[0, -1, :] + return self.decode_logits_to_token(logits) + + if self.args.use_cuda_graph: + warm_uped = True + + if self.use_static_cache: + logits = self.model( + self.current_ids.to(torch_device), + cache_position=self.active_cache_position, + past_key_values=self.cache, + return_dict=False, + use_cache=True, + )[0] + else: + logits = self.model(self.current_ids, return_dict=False)[0] + self.cache.change_seq_length(1) + logits = logits[0, -1, :] + + return self.decode_logits_to_token(logits) + def decode_one_tokens(self): + if use_torch_npu: + return self.decode_one_tokens_npu() + global warm_uped device_map = self.model.gguf_loader.tensor_device_map @@ -127,9 +245,145 @@ class KTransformersInterface(TransformersInterface): return self.logits_to_token(logits) + @torch.no_grad + def prefill_npu(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None): + input_ids_length = input_ids.shape[-1] + if(input_ids_length >= self.args.cache_lens): + logger.warning(f"input_ids_length {input_ids_length} > cache_lens {self.args.cache_lens}") + self.seq_length = input_ids_length + return + logger.debug(f"input_ids: {input_ids.shape}") + device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0") + device = "cuda:0" if device == "cuda" else device + device = self.args.device + if is_new: + self.ever_generated_ids.clear() + same_prefix = 0 + flat_input_ids = input_ids.flatten() + + if getattr(self, 'generated_ids', None) is None: + self.generated_ids = torch.zeros( + self.args.batch_size, + input_ids.shape[-1] + self.args.max_new_tokens + 1, + dtype=torch.int, + device=self.args.device, + ) + self.seq_length = 1 + + # flat_prev_ids = self.generated_ids.flatten() + # for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1): + # if flat_input_ids[i] == flat_prev_ids[i]: + # same_prefix += 1 + # else: + # break + + logger.debug(f"same prefix len: {same_prefix}") + self.cache.remove_suffix(same_prefix) + self.seq_length = same_prefix + self.cache.position[0] = same_prefix + self.generated_ids = self.generated_ids[..., :same_prefix] + input_ids = input_ids[..., same_prefix:] + input_ids_length = input_ids.shape[-1] + + self.ever_generated_ids.clear() + self.profiler.set_counter("prefill", input_ids_length) + logger.debug(f"input_ids: {input_ids.shape}") + logger.debug(f"generate_ids: {self.generated_ids.shape}") + + former_seq_length = self.seq_length + self.seq_length += input_ids_length + expected_length = min(self.seq_length + self.args.max_new_tokens + 1, self.args.cache_lens) + delta_length = expected_length - self.generated_ids.shape[-1] + if delta_length > 0: + new_generate_ids = torch.zeros( + self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device + ) + self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1) + else: + logger.warning(f"seq_length bigger than cache_lens, killed") + exit(0) + + 
logger.debug(f"cache position: {former_seq_length} to {self.seq_length}") + cache_position = torch.arange(former_seq_length, self.seq_length, device=device) + self.cache.position[0] = self.seq_length + 1 + self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int) + + if not (type(self) is TransformersInterface): + input_ids = input_ids.to("cpu") + + def chunk_prefill(input_ids, cache_position): + inputs_embeds = self.model.model.embed_tokens(input_ids).to(device) + torch.cuda.set_device(device) + if flashinfer_enabled: + MLAWrapperSingleton.need_plan_all() + if self.use_static_cache: + logits = self.model( + inputs_embeds=inputs_embeds, + cache_position=cache_position, + past_key_values=self.cache, + return_dict=False, + use_cache=True, + )[0] + else: + logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0] + + return logits + + logits = None + def prefill_wrapper(prof=None): + nonlocal logits + chunk_start = 0 + while chunk_start < input_ids_length: + chunk_end = min(chunk_start + self.args.chunk_size, input_ids_length) + if self.cache != None: + self.cache.cur_idx = cache_position[chunk_start:chunk_end] + logits = chunk_prefill(input_ids[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end]) + chunk_start += self.args.chunk_size + if prof is not None: + prof.step() + if prof is not None: + prof.stop() + if logits is None: + raise ValueError('logits cannot be None') + + + global WARM_UP_SKIP_CNT + prof_prefill = os.environ["PROF_PREFILL"] if "PROF_PREFILL" in os.environ else "0" + if prof_prefill == "1": + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False + ) + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU + ], + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=8, repeat=1, skip_first=0), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./prefill_prof_lm_head"), + record_shapes=True, + profile_memory=True, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config) as prof: + prefill_wrapper(prof) + else: + prefill_wrapper() + + if flashinfer_enabled: + MLAWrapperSingleton.reset_buffer() + self.prepare_logits_wrapper(input_ids, device, temperature, top_p) + next_token = self.logits_to_token(logits[0, -1, :]) + yield self.append_new_tokens(next_token) + @torch.no_grad def prefill(self, input_ids: torch.Tensor, is_new: bool, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None): + + if use_torch_npu: + return self.prefill_npu(self, input_ids, is_new, temperature, top_p, max_tokens, max_completion_tokens) + input_ids_length = input_ids.shape[-1] if max_tokens is not None: max_completion_tokens = max_tokens @@ -144,6 +398,8 @@ class KTransformersInterface(TransformersInterface): logger.debug(f"input_ids: {input_ids.shape}") device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0") device = "cuda:0" if device == "cuda" else device + if use_torch_npu: + device = self.args.device if is_new: self.ever_generated_ids.clear() @@ -159,16 +415,19 @@ class KTransformersInterface(TransformersInterface): ) self.seq_length = 1 - flat_prev_ids = self.generated_ids.flatten() - for i in range(min(self.seq_length, 
flat_input_ids.shape[0]) - 1): - if flat_input_ids[i] == flat_prev_ids[i]: - same_prefix += 1 - else: - break + if not use_torch_npu: + flat_prev_ids = self.generated_ids.flatten() + for i in range(min(self.seq_length, flat_input_ids.shape[0]) - 1): + if flat_input_ids[i] == flat_prev_ids[i]: + same_prefix += 1 + else: + break logger.debug(f"same prefix len: {same_prefix}") self.cache.remove_suffix(same_prefix) self.seq_length = same_prefix + if use_torch_npu: + self.cache.position[0] = same_prefix self.generated_ids = self.generated_ids[..., :same_prefix] input_ids = input_ids[..., same_prefix:] input_ids_length = input_ids.shape[-1] @@ -193,6 +452,8 @@ class KTransformersInterface(TransformersInterface): logger.debug(f"cache position: {former_seq_length} to {self.seq_length}") cache_position = torch.arange(former_seq_length, self.seq_length, device=device) + if use_torch_npu: + self.cache.position[0] = self.seq_length + 1 self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int) if not (type(self) is TransformersInterface): @@ -248,4 +509,18 @@ class KTransformersInterface(TransformersInterface): decode_time = self.profiler.get_timer_sec('decode'), prefill_count = self.profiler.get_counter('prefill'), decode_count = self.profiler.get_counter('decode'), - ) \ No newline at end of file + ) + + def sync_inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None) -> str: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + async def run_async(): + result = [] + async for chunk in self.inference(local_messages, thread_id, temperature, top_p): + pass + return "" + return loop.run_until_complete(run_async()) + finally: + loop.close() \ No newline at end of file diff --git a/ktransformers/server/backend/interfaces/transformers.py b/ktransformers/server/backend/interfaces/transformers.py index 78cb73f..d1dcda5 100644 --- a/ktransformers/server/backend/interfaces/transformers.py +++ b/ktransformers/server/backend/interfaces/transformers.py @@ -32,6 +32,20 @@ from ktransformers.server.config.log import logger from ..args import ConfigArgs, default_args from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton + + +try: + import torch_npu + from ktransformers.util import utils + + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + + +import torch.distributed as dist + + # This TextStreamer is a modified version from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/streamers.py class TextStreamer: @@ -191,11 +205,19 @@ class TransformersInterface(BackendInterfaceBase): # input_ids = self.tokenizer.apply_chat_template( # new_messages, return_tensors="pt", add_generation_prompt=True # ).to(self.args.device) - input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True) - # drop token in chat template - if input_str.endswith('\n'): - input_str = input_str[:-len('\n')] - input_ids = self.tokenizer.encode(input_str, return_tensors="pt").to(self.args.device) + + if not use_torch_npu: + input_str: str = self.tokenizer.apply_chat_template(new_messages,tokenize=False,add_generation_prompt=True) + # drop token in chat template + if input_str.endswith('\n'): + input_str = input_str[:-len('\n')] + input_ids = self.tokenizer.encode(input_str, return_tensors="pt").to(self.args.device) + else: + logger.debug(f"new_messages: {new_messages}") + input_ids = 
self.tokenizer.apply_chat_template( + new_messages, add_generation_prompt=True, return_tensors="pt" + ) + if (self.last_request_id is not None) and self.last_request_id == thread_id: x = self.generated_ids[:,:self.seq_length] y = input_ids[:,:self.seq_length] @@ -212,6 +234,8 @@ class TransformersInterface(BackendInterfaceBase): def append_new_tokens(self, new_tokens: int) -> Optional[str]: self.generated_ids[0, self.seq_length] = new_tokens self.seq_length += 1 + if use_torch_npu: + self.cache.position[0] = self.seq_length return self.streamer.put(new_tokens) @staticmethod @@ -273,14 +297,21 @@ class TransformersInterface(BackendInterfaceBase): top_p = self.model.generation_config.top_p if top_p == 0: top_p = 0.0001 - generation_config, model_kwargs = self.model._prepare_generation_config( - None, max_length=self.args.max_new_tokens, - do_sample=True, - top_k=self.args.top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=self.args.repetition_penalty # change this to modify generate config - ) + + if use_torch_npu: + generation_config, model_kwargs = self.model._prepare_generation_config( + None, do_sample=True, + top_p=top_p, temperature=temperature + ) + else: + generation_config, model_kwargs = self.model._prepare_generation_config( + None, max_length=self.args.max_new_tokens, + do_sample=True, + top_k=self.args.top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=self.args.repetition_penalty # change this to modify generate config + ) self.inputs = inputs self.logits_warper = self.tf_logits_warper(generation_config) @@ -372,7 +403,10 @@ class TransformersInterface(BackendInterfaceBase): cache_position = torch.arange(former_seq_length, self.seq_length, device=self.args.device) self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int) - device = input_ids.device + if use_torch_npu: + device = self.args.device + else: + device = input_ids.device if not (type(self) is TransformersInterface): input_ids = input_ids.to("cpu") inputs_embeds = self.model.model.embed_tokens(input_ids).to(device) @@ -420,7 +454,12 @@ class TransformersInterface(BackendInterfaceBase): else: # for's else, if output get max new tokens yield self.streamer.end(), None yield "", "length" - + + if use_torch_npu and self.args.use_cuda_graph: + utils._USE_NPU_GRAPH = False + from ktransformers.util.npu_graph_runner import get_or_create_runner + npu_graph_runner = get_or_create_runner(self.args.device) + npu_graph_runner.destroy() def check_is_new(self, thread_id: str): @@ -436,7 +475,87 @@ class TransformersInterface(BackendInterfaceBase): self.last_request_id = thread_id return True + + async def inference_npu(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None): + self.streamer.reset() + self.profiler.create_and_start_timer("tokenize") + rank = torch.distributed.get_rank() + tp_size = utils.get_tensor_parallel_size() + world_size = torch.distributed.get_world_size() + if isinstance(local_messages, List): + input_ids = self.format_and_tokenize_input_ids(thread_id, local_messages) + elif isinstance(local_messages, str): + #local_messages = local_messages[0]['content'] + input_ids = self.tokenize_prompt(local_messages) + #input_ids = torch.tensor([[6366]], device=input_ids.device) + else: + raise ValueError("local_messages should be List or str") + + if tp_size == world_size and tp_size > 1: + torch.distributed.barrier() + input_size = torch.tensor([input_ids.size(1)], dtype=torch.int64, 
device=self.args.device) + all_input_sizes = [torch.zeros_like(input_size) for _ in range(world_size)] + dist.all_gather(all_input_sizes, input_size) + + max_input_size = max([size.item() for size in all_input_sizes]) + padded_input_ids = torch.zeros(1, max_input_size, dtype=input_ids.dtype, device=self.args.device) + padded_input_ids[0, :input_ids.size(1)] = input_ids[0] + + all_padded_inputs = [torch.zeros_like(padded_input_ids) for _ in range(world_size)] + dist.all_gather(all_padded_inputs, padded_input_ids) + + original_size = all_input_sizes[0].item() + input_ids = all_padded_inputs[0][:, :original_size] + + if Config().user_force_think: + token_thinks = torch.tensor([self.tokenizer.encode("\n",add_special_tokens=False)],device=input_ids.device) + if not torch.equal(input_ids[0, -token_thinks.shape[-1]:], token_thinks[-1]): + input_ids = torch.cat( + [input_ids, token_thinks], dim=1 + ) + + self.profiler.pause_timer("tokenize") + + self.profiler.create_and_start_timer("prefill") + + if Config().user_force_think: + think = '\n' + if tp_size == world_size and rank != 0: + pass + else: + print(think, end="",flush=True) + yield think, None + + for t in self.prefill(input_ids, self.check_is_new(thread_id), temperature, top_p): + # output think token after prefill done + if t is not None: + print(t, end="",flush=True) + yield t, None + self.profiler.pause_timer("prefill") + + self.profiler.create_and_start_timer("decode") + for t, finish_reason in self.generate(): + if t is not None: + if tp_size == world_size and rank != 0: + pass + else: + print(t, end="",flush=True) + yield t, finish_reason + + if tp_size == world_size and rank != 0: + pass + else: + self.profiler.pause_timer("decode") + self.report_last_time_performance() + async def inference(self, local_messages, thread_id: str, temperature: Optional[float] = None, top_p: Optional[float] = None, max_tokens: Optional[float] = None, max_completion_tokens: Optional[float] = None): + + if use_torch_npu: + async for tok in self.inference_npu(local_messages, thread_id, temperature, top_p): + yield tok + return + + self.streamer.reset() self.profiler.create_and_start_timer("tokenize") if isinstance(local_messages, List): diff --git a/ktransformers/server/main.py b/ktransformers/server/main.py index 3341ee9..2877a96 100644 --- a/ktransformers/server/main.py +++ b/ktransformers/server/main.py @@ -9,7 +9,7 @@ project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) from fastapi.middleware.cors import CORSMiddleware from ktransformers.server.args import ArgumentParser from ktransformers.server.config.config import Config -from ktransformers.server.utils.create_interface import create_interface, GlobalInterface +from ktransformers.server.utils.create_interface import create_interface, GlobalInterface, get_thread_context_manager from fastapi.openapi.utils import get_openapi from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware @@ -17,6 +17,21 @@ from ktransformers.server.api import router, post_db_creation_operations from ktransformers.server.utils.sql_utils import Base, SQLUtil from ktransformers.server.config.log import logger +import asyncio +from uuid import uuid4 +import torch.distributed +import subprocess +import tempfile +import atexit + +try: + import torch_npu + from ktransformers.util import utils + + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + def mount_app_routes(mount_app: FastAPI): sql_util = SQLUtil() @@ -100,6 +115,77 @@ def custom_openapi(app): return 
app.openapi_schema +def main_npu(): + torch.npu.config.allow_internal_format = False + cfg = Config() + + arg_parser = ArgumentParser(cfg) + + args = arg_parser.parse_args() + utils.USE_NPU_GRAPH = args.use_cuda_graph + new_chunk_size = min(max(args.chunk_size, 512), utils._MAX_CHUNK_SIZE) + if new_chunk_size != args.chunk_size: + args.chunk_size = new_chunk_size + print(f'[WARN] Chunk size reset to legal value between [512, {utils._MAX_CHUNK_SIZE}] which is {args.chunk_size}.') + + if args.backend_type == "balance_serve": + import pickle + def cleanup(): + if sched_process.poll() is None: + sched_process.terminate() + + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + pickle.dump(args, temp_file) + temp_file_path = temp_file.name + current_file = __file__ + target_file = os.path.join(os.path.dirname(current_file), "balance_serve", "sched_rpc.py") + target_file = os.path.normpath(target_file) + log_path = os.path.join(args.log_dir, "rpc.log") + log = open(log_path, "a") + sched_process = subprocess.Popen( + ["python3", target_file, "--config", temp_file_path], + stdout=log, + stderr=log + ) + print("sched_rpc started with PID:", sched_process.pid) + atexit.register(cleanup) + create_interface(config=cfg, default_args=cfg, input_args=args) + args.port += torch.distributed.get_rank() + tp_size = utils.get_tensor_parallel_size() + world_size = torch.distributed.get_world_size() + if tp_size == world_size and tp_size > 1: + if torch.distributed.get_rank() == 0: + app = create_app() + custom_openapi(app) + run_api( + app=app, + host=args.host, + port=args.port, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ) + else: + while True: + try: + context = get_thread_context_manager() + id = str(uuid4()) + context.interface.sync_inference("", id) + except Exception as e: + print(f"An error occurred: {e}") + finally: + pass + else: + app = create_app() + custom_openapi(app) + + run_api( + app=app, + host=args.host, + port=args.port, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ) + def main(): cfg = Config() @@ -119,4 +205,7 @@ def main(): ) if __name__ == "__main__": - main() + if use_torch_npu: + main_npu() + else: + main() diff --git a/ktransformers/server/utils/create_interface.py b/ktransformers/server/utils/create_interface.py index 992c831..c91f79e 100644 --- a/ktransformers/server/utils/create_interface.py +++ b/ktransformers/server/utils/create_interface.py @@ -16,7 +16,7 @@ from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface from ktransformers.server.backend.interfaces.transformers import TransformersInterface from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface -def create_interface(config: Config, default_args: ConfigArgs): +def create_interface(config: Config, default_args: ConfigArgs, input_args=None): if config.backend_type=='transformers': from ktransformers.server.backend.interfaces.transformers import TransformersInterface as BackendInterface elif config.backend_type == 'exllamav2': @@ -27,7 +27,12 @@ def create_interface(config: Config, default_args: ConfigArgs): from ktransformers.server.backend.interfaces.balance_serve import BalanceServeInterface as BackendInterface else: raise NotImplementedError(f'{config.backend_type} not implemented') - GlobalInterface.interface = BackendInterface(default_args) + + if config.backend_type == 'ktransformers': + GlobalInterface.interface = BackendInterface(default_args, input_args) + else: + GlobalInterface.interface = 
BackendInterface(default_args) + GlobalContextManager.context_manager = ThreadContextManager(GlobalInterface.interface) class GlobalContextManager: diff --git a/ktransformers/util/ascend/ascend_utils.py b/ktransformers/util/ascend/ascend_utils.py new file mode 100644 index 0000000..aa98cd7 --- /dev/null +++ b/ktransformers/util/ascend/ascend_utils.py @@ -0,0 +1,210 @@ +import os +from datetime import timedelta + +import torch +try: + import torch_npu +except: + Warning("torch_npu not found, please install torch_npu for NPU support.") +import torch.distributed as dist + +_DATA_PARALLEL_SIZE = 0 +_TENSOR_PARALLEL_SIZE = 0 +_DATA_PARALLEL_GROUP = None +_TENSOR_PARALLEL_RANKS = None +_TENSOR_PARALLEL_GROUP = None +_DATA_PARALLEL_GROUP_GLOO = None +_DATA_PARALLEL_RANKS = None +_GLOBAL_GROUP = None +_LM_HEAD_GROUP = None + + +def setup_model_parallel(distributed_timeout_minutes: int = 30, tp: int = 1): + global _DATA_PARALLEL_SIZE + global _DATA_PARALLEL_GROUP + global _DATA_PARALLEL_RANKS + global _TENSOR_PARALLEL_SIZE + global _TENSOR_PARALLEL_RANKS + global _TENSOR_PARALLEL_GROUP + + os.environ["MASTER_ADDR"] = "localhost" + local_rank = int(os.getenv("LOCAL_RANK", '0')) + world_size = int(os.getenv("WORLD_SIZE", '1')) + torch_npu.npu.set_device(local_rank) + tp_size = tp + dp_size = world_size // tp_size + _DATA_PARALLEL_SIZE = dp_size + _TENSOR_PARALLEL_SIZE = tp_size + + torch.set_num_threads(8) + timeout = timedelta(minutes=distributed_timeout_minutes) + print(f"start to init process group ------rank is {local_rank}, world_size is {world_size}") + torch.distributed.init_process_group( + backend='hccl', + world_size=world_size, rank=local_rank + ) + print(f"init process group success ------rank is {local_rank}, world_size is {world_size}") + + rank = torch.distributed.get_rank() + nccl_comm_cfgs = {} + + for dp_group_id in range(tp_size): + ranks = list(range(dp_group_id, world_size, tp_size)) + dp_group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('dp', nccl_comm_cfgs) + ) + if rank in ranks: + global _DATA_PARALLEL_GROUP + _DATA_PARALLEL_GROUP = dp_group + _DATA_PARALLEL_RANKS = ranks + + for tp_group_id in range(dp_size): + start_rank = tp_group_id * tp_size + end_rank = (tp_group_id + 1) * tp_size + ranks = list(range(start_rank, end_rank)) + tp_group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp', nccl_comm_cfgs) + ) + if rank in ranks: + global _TENSOR_PARALLEL_GROUP + _TENSOR_PARALLEL_GROUP = tp_group + _TENSOR_PARALLEL_RANKS = ranks + + torch.manual_seed(1) + return local_rank, world_size + + +def get_tensor_parallel_size(): + assert _TENSOR_PARALLEL_SIZE is not None, "tensor parallel size is not set" + return _TENSOR_PARALLEL_SIZE + + +def get_tensor_parallel_group(): + assert _TENSOR_PARALLEL_GROUP is not None, "tensor parallel group is not initialized" + return _TENSOR_PARALLEL_GROUP + + +def get_tensor_parallel_ranks(): + assert _TENSOR_PARALLEL_RANKS is not None, "tensor parallel ranks is not initialized" + return _TENSOR_PARALLEL_RANKS + + +def get_data_parallel_size(): + assert _DATA_PARALLEL_SIZE is not None, "data parallel size is not initialized" + return _DATA_PARALLEL_SIZE + + +def get_data_parallel_gloo(): + assert _DATA_PARALLEL_GROUP_GLOO is not None, "data parallel gloo group is not initialized" + return _DATA_PARALLEL_GROUP_GLOO + + +def get_data_parallel_group(): + assert _DATA_PARALLEL_GROUP is not None, "data parallel group is not initialized" + return _DATA_PARALLEL_GROUP + + 
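For orientation, here is a minimal, hedged sketch of how the parallel groups created by setup_model_parallel above might be exercised on Ascend; the torchrun launch line, the tp=2 value and the tensor shape are illustrative assumptions rather than part of this patch:

import torch
import torch.distributed as dist
import torch_npu  # needed so .npu() tensors are available on Ascend (assumption: torch_npu installed)
from ktransformers.util.ascend.ascend_utils import (
    setup_model_parallel, get_tensor_parallel_size, get_tensor_parallel_group)

# Assumed launch: torchrun --nproc_per_node=2 demo.py (sets LOCAL_RANK / WORLD_SIZE).
local_rank, world_size = setup_model_parallel(tp=2)  # tp=2 is an illustrative value

# Each tensor-parallel rank holds a partial result; summing over the TP group
# (the same pattern allreduce_wrapper applies further down in this file) restores the full value.
partial = torch.full((4,), float(local_rank), dtype=torch.float16).npu()
dist.all_reduce(partial, op=dist.ReduceOp.SUM, group=get_tensor_parallel_group())
print(f"rank {local_rank}/{world_size} tp={get_tensor_parallel_size()} sum={partial[0].item()}")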
+def get_data_parallel_ranks(): + assert _DATA_PARALLEL_RANKS is not None, "data parallel ranks is not initialized" + return _DATA_PARALLEL_RANKS + + +def get_global_group(): + assert _GLOBAL_GROUP is not None, "global group is not initialized" + return _GLOBAL_GROUP + + +def get_nccl_options(pg_name, nccl_comm_cfgs): + if pg_name in nccl_comm_cfgs: + nccl_options = torch.distributed.ProcessGroupNCCL.Options() + nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get('cga_cluster_size', 4) + nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get('max_ctas', 32) + nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get('min_ctas', 1) + return nccl_options + else: + return None + + +def get_safetensors_cut_weight(name: str, weights: torch.Tensor): + translate_col_cut_tensors = ["ffn_down", "attn_output"] + translate_row_cut_tensors = ["ffn_gate", "ffn_up", "attn_q_b"] + translate_lm_cut_tensor = ["output"] + tp = get_tensor_parallel_size() + if tp == 1 or weights.shape == torch.Size([1]): + return weights + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + rank %= tp + assert 0 <= rank < tp and tp > 0, f"unexpected {rank=}, {tp=}" + if any(t in name for t in translate_col_cut_tensors): + if weights.dim() == 1: + return weights + dim = weights.shape[-1] + assert dim % tp == 0, f"unexpected division {dim=}, {tp=}" + chunk_size = dim // tp + output_weights = weights[:, rank * chunk_size: (rank + 1) * chunk_size] + return output_weights + elif any(t in name for t in translate_row_cut_tensors): + dim = weights.shape[0] + assert dim % tp == 0, f"unexpected division {dim=}, {tp=}" + chunk_size = dim // tp + output_weights = weights[rank * chunk_size: (rank + 1) * chunk_size:] + return output_weights + elif (tp > 1) and (any(t in name for t in translate_lm_cut_tensor)): + dim = weights.shape[0] + assert dim % tp == 0, f"unexpected division {dim=} {world_size=}" + chunk_size = dim // tp + output_weights = weights[rank * chunk_size: (rank + 1) * chunk_size:] + return output_weights + else: + return weights + + +def get_absort_weight(model, config): + local_rank = torch.distributed.get_rank() + tp = get_tensor_parallel_size() + local_rank %= tp + tp_heads = config.num_attention_heads // tp + for i in range(config.num_hidden_layers): + self = model.model.layers[i].self_attn + if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')): + kv_b_proj = self.kv_b_proj.weight.view(config.num_attention_heads, -1, self.kv_lora_rank) + q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].clone() + out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].clone() + q_absorb = q_absorb[local_rank * tp_heads: (local_rank + 1) * tp_heads, :, :].contiguous() + out_absorb = out_absorb[local_rank * tp_heads: (local_rank + 1) * tp_heads, :, :].contiguous() + out_absorb = out_absorb.transpose(1, 2).contiguous() + setattr(self, "q_absorb", q_absorb) + setattr(self, "out_absorb", out_absorb) + del self.orig_module.kv_b_proj + + +def allreduce_wrapper(func): + def wrapper(*args, **kwargs): + orig_output = func(*args, **kwargs) + if isinstance(orig_output, tuple): + if get_tensor_parallel_size() > 1: + org_dtype = orig_output[0].dtype + if org_dtype == torch.bfloat16: + dist.all_reduce(orig_output[0].to(dtype=torch.float16), op=dist.ReduceOp.SUM, + group=get_tensor_parallel_group()) + else: + dist.all_reduce(orig_output[0], op=dist.ReduceOp.SUM, group=get_tensor_parallel_group()) + if org_dtype == torch.bfloat16: + bf_orig_output = orig_output[0].to(dtype=org_dtype) + 
else: + bf_orig_output = orig_output[0] + else: + bf_orig_output = orig_output[0] + return (bf_orig_output,) + orig_output[1:] + else: + if get_tensor_parallel_size() > 1: + org_dtype = orig_output.dtype + if org_dtype == torch.bfloat16: + orig_output = orig_output.to(dtype=torch.float16) + dist.all_reduce(orig_output, op=dist.ReduceOp.SUM, group=get_tensor_parallel_group()) + if org_dtype == torch.bfloat16: + orig_output = orig_output.to(dtype=org_dtype) + return orig_output + + return wrapper \ No newline at end of file diff --git a/ktransformers/util/custom_loader.py b/ktransformers/util/custom_loader.py index 003f93c..0ad028d 100644 --- a/ktransformers/util/custom_loader.py +++ b/ktransformers/util/custom_loader.py @@ -7,10 +7,19 @@ from typing import Sequence import os from enum import IntEnum import torch -if not torch.xpu.is_available(): + +try: + import torch_npu + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + + +if not torch.xpu.is_available() and not use_torch_npu: import KTransformersOps from safetensors import safe_open -from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant +if not use_torch_npu: + from ktransformers.ktransformers_ext.triton.fp8gemm import fp8_gemm, act_quant, weight_dequant from ktransformers.util.custom_gguf import * from safetensors.torch import save_file from abc import ABC, abstractmethod @@ -42,6 +51,7 @@ class SafeTensorLoader(ModelLoader): tensor_device_map: dict def __init__(self, file_path: str): + self.__load_tensor_file_map(file_path) def __load_tensor_file_map(self, file_path: str): @@ -84,6 +94,7 @@ class SafeTensorLoader(ModelLoader): # if not found_safetensor: # raise FileNotFoundError(f"No Safetensor files found in {folder_path}") + def load_tensor(self, key: str, device: str="cpu"): if translate_name_to_gguf(key) in self.tensor_file_map: key = translate_name_to_gguf(key) @@ -96,6 +107,7 @@ class SafeTensorLoader(ModelLoader): if f is None: raise FileNotFoundError(f"File {file} not found in Safetensor files") tensor = f.get_tensor(key) + return tensor.to(device) def load_experts(self, key: str, device: str="cpu"): @@ -252,20 +264,57 @@ class SafeTensorLoader(ModelLoader): def has_tensor(self, name: str): return name in self.tensor_file_map or translate_name_to_gguf(name) in self.tensor_file_map + +class W8A8SafeTensorLoader(SafeTensorLoader): + def load_tensor(self, key: str, device: str = "cpu"): + if key not in self.tensor_file_map: + raise KeyError(f"Key {key} not found in Safetensor files") + file = self.tensor_file_map[key] + f = self.file_handle_map.get(file) + if f is None: + raise FileNotFoundError(f"File {file} not found in Safetensor files") + tensor = f.get_tensor(key) + if "deq_scale" in key: + tensor = torch.from_numpy( + np.frombuffer(tensor.to(torch.float16).to(torch.float32).numpy().tobytes(), dtype=np.int32).astype(np.int64)) + if "input_scale" in key: + tensor = tensor.to(torch.float16) + if "weight_scale" in key or "weight_offset" in key: + if "ffn" in key: + tensor = tensor.to(torch.float32) + else: + tensor = tensor.to(torch.float16) + if "input_offset" in key: + tensor = tensor.to(torch.int8) + if tensor.dtype == torch.bfloat16: + tensor = tensor.to(torch.float16) + return tensor.to(device) + + def load_dequantized_tensor(self, key: str, device: str = "cpu"): + tensor = self.load_tensor(key, device) + return tensor + class GGUFLoader(ModelLoader): tensor_info: dict gguf_path: str tensor_file_map: dict # {tensor_name: tensor_file_path} 
gguf_file_meta: dict safetensor_loader: SafeTensorLoader - def __init__(self, gguf_path: str): + def __init__(self, gguf_path: str, quantize: str = None): # Check dir exist if not os.path.exists(gguf_path): raise FileNotFoundError(f"GGUF dir not found: {gguf_path}") if os.path.isfile(gguf_path): gguf_path = os.path.dirname(gguf_path) - self.safetensor_loader = None + safetensor_loader = SafeTensorLoader(gguf_path) + if quantize == "w8a8_dynamic": + safetensor_loader = W8A8SafeTensorLoader(gguf_path) + else: + safetensor_loader = SafeTensorLoader(gguf_path) + if safetensor_loader.tensor_file_map: + self.safetensor_loader = safetensor_loader + return self.tensor_info = {} self.gguf_path = gguf_path diff --git a/ktransformers/util/npu_graph.py b/ktransformers/util/npu_graph.py new file mode 100644 index 0000000..4a0961d --- /dev/null +++ b/ktransformers/util/npu_graph.py @@ -0,0 +1,77 @@ +import time + +import torch +import torch_npu +import sys +import os + +from ktransformers.util.utils import USE_NPU_GRAPH +if USE_NPU_GRAPH: + CAPTURE_PLUGIN_PATH = os.environ.get("CAPTURE_PLUGIN_PATH") + if CAPTURE_PLUGIN_PATH is None: + raise RuntimeError("env CAPTURE_PLUGIN_PATH not exist") + + sys.path.append(CAPTURE_PLUGIN_PATH) + + from libgraph_capture import graph_capture_init + from libgraph_capture import graph_capture_destroy + from libgraph_capture import graph_capture_begin + from libgraph_capture import graph_capture_end + from libgraph_capture import graph_capture_replay + from libgraph_capture import graph_capture_launch_callback + + +class NpuGraph: + def init(self): + ret = graph_capture_init() + if ret != 0: + exit() + + def destroy(self): + ret = graph_capture_destroy() + if ret != 0: + exit() + + def capture_begin( + self, + stream, + capture_error_mode="global"): + torch.npu.synchronize() + torch.npu.empty_cache() + ret = graph_capture_begin(stream, capture_error_mode) + if ret != 0: + exit() + + def capture_end( + self, + stream): + ret = graph_capture_end(stream) + if ret != 0: + exit() + + def replay( + self, + stream): + ret = graph_capture_replay(stream) + if ret != 0: + exit() + + def launch_callback(self, func, data, block, stream): + graph_capture_launch_callback(func, data, block, stream) + + +class graph: + def __init__( + self, + npu_graph: NpuGraph, + pool, + stream, + capture_error_mode: str = "global"): + self.npu_graph = npu_graph + self.stream = stream.npu_stream + + def __enter__(self): + self.npu_graph.capture_begin(self.stream) + + def __exit__(self, exc_type, exc_val, exc_tb): + self.npu_graph.capture_end(self.stream) \ No newline at end of file diff --git a/ktransformers/util/npu_graph_runner.py b/ktransformers/util/npu_graph_runner.py new file mode 100644 index 0000000..c3169e5 --- /dev/null +++ b/ktransformers/util/npu_graph_runner.py @@ -0,0 +1,218 @@ +''' +Description : +Author : Boxin Zhang +Version : 0.1.0 +Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+''' +from typing import Dict + +import acl +import torch +import torch_npu +from torch import nn + +import ktransformers.util.npu_graph as npu_graph +from ktransformers.util.utils import CUR_DEVICE + + +class NPUGraphRunner: + def __init__(self, deviceId): + torch.npu.set_compile_mode(jit_compile=False) + self.deviceId = deviceId + self.enable = False + self.debug = False + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} + self.tid = None + self.past_key_value = None + + def init(self, batch_size, seq_length): + self.tmp_g = npu_graph.NpuGraph() + self.graph = torch.npu.NPUGraph() + self.main_stream = torch_npu.npu.Stream(device=self.deviceId) + self.update_stream = torch_npu.npu.Stream(device=self.deviceId) + self.stream = self.main_stream.npu_stream + self.logits = torch.zeros((batch_size, seq_length, 7168), dtype=torch.float16).to(self.deviceId) + self.context, ret = acl.rt.get_context(self.deviceId) + if ret != 0: + print("get_context failed! ret: " + str(ret)) + exit(-1) + self.exit_flag = False + self.handle = [] + self.ifa_param = [] + self.event = [] + self.first_update = True + self.workspace = None + + if self.tid is None: + def process_callback(args_list): + ins = args_list[0] + ret = acl.rt.set_context(ins.context) + if ret != 0: + print("set_context failed! ret: " + str(ret)) + exit(-1) + + while True: + acl.rt.process_report(1) + if ins.exit_flag: + break + + self.tid, ret = acl.util.start_thread(process_callback, [self]) + if ret != 0: + print("start_thread failed!") + exit(-1) + + ret = acl.rt.subscribe_report(self.tid, self.stream) + if ret != 0: + print("subscribe_report failed!") + exit(-1) + + def destroy(self): + print(f'[rank:{torch.distributed.get_rank()}]------------- NPU Graph Destroy Begin -------------\n', end='') + self.exit_flag = True + ret = acl.rt.unsubscribe_report(self.tid, self.stream) + if ret != 0: + print("unsubscribe_report failed!") + exit(-1) + self.enable = False + ret = acl.util.stop_thread(self.tid) + if ret != 0: + print("stop_thread failed!") + exit(-1) + self.tid = None + self.workspace = None + self.handle = [] + self.ifa_param = [] + self.event = [] + self.first_update = True + del self.graph + self.tmp_g.destroy() + destroy_runner(self.deviceId) + print(f'[rank:{torch.distributed.get_rank()}]------------- NPU Graph Destroy Finish -------------\n', end='') + + def capture( + self, + model, + cur_token, + position_ids, + cache_position, + past_key_values, + main_device, + **kwargs, + ) -> None: + print(f'[rank:{torch.distributed.get_rank()}]------------- NPU Graph Capture Begin -------------\n', end='') + self.enable = True + self.model = model + inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(main_device) + self.seq_length = inputs_embeds.size()[1] + self.main_device = main_device + with torch.no_grad(): + with torch.npu.graph(self.graph, stream=self.main_stream): + self.logits = model(inputs_embeds=inputs_embeds, + position_ids=position_ids, + cache_position=cache_position, + past_key_values=past_key_values, + **kwargs)[0] + + if past_key_values != None: + past_key_values.change_seq_length(-1) + + self.input_buffers = { + "inputs_embeds": inputs_embeds, + "position_ids": position_ids, + "cache_position": cache_position, + } + + self.output_buffers = {"logits": self.logits} + print(f'[rank:{torch.distributed.get_rank()}]------------- NPU Graph Capture Finish -------------\n', end='') + return + + def forward( + self, + inputs_embeds, + position_ids, + cache_position, + ) 
-> torch.Tensor: + def ifa_update_sync(param): + with torch.npu.stream(self.update_stream): + for i in range(len(self.handle)): + if self.first_update is False: + q_nope, kvCache, q_pe, kRopeCache, num_heads, \ + softmax_scale, layer_idx, attn_output, softmax_lse = self.ifa_param[i] + torch.npu.graph_task_update_begin(self.update_stream, self.handle[i]) + torch_npu.npu_fused_infer_attention_score.out( + q_nope, + kvCache, + kvCache, + workspace=self.workspace, + query_rope=q_pe, + key_rope=kRopeCache, + num_heads=num_heads, + num_key_value_heads=1, + input_layout="BNSD", + atten_mask=None, + scale=softmax_scale, + antiquant_mode=0, + antiquant_scale=None, + block_table=self.past_key_value.page_table_list[layer_idx], + block_size=self.past_key_value.page_size, + actual_seq_lengths_kv=self.past_key_value.position, + out=[attn_output, softmax_lse]) + torch.npu.graph_task_update_end(self.update_stream) + self.event[i].record(self.update_stream) + + self.ifa_update_tid, ret = acl.util.start_thread(ifa_update_sync, [self]) + if ret != 0: + print("start_thread failed!") + exit(-1) + + ret1 = acl.rt.memcpy(self.input_buffers["inputs_embeds"].data_ptr(), inputs_embeds.numel() * 2, + inputs_embeds.data_ptr(), inputs_embeds.numel() * 2, 3) + ret2 = acl.rt.memcpy(self.input_buffers["position_ids"].data_ptr(), position_ids.numel() * 8, + position_ids.data_ptr(), position_ids.numel() * 8, 3) + ret3 = acl.rt.memcpy(self.input_buffers["cache_position"].data_ptr(), cache_position.numel() * 8, + cache_position.data_ptr(), cache_position.numel() * 8, 3) + torch_npu.npu.synchronize() + + with torch_npu.npu.stream(self.main_stream): + self.graph.replay() + self.first_update = False + ret = acl.util.stop_thread(self.ifa_update_tid) + if ret != 0: + print("stop_thread failed!") + exit(-1) + else: + self.ifa_update_tid = None + return self.output_buffers["logits"] + + def launch_callback(self, func, data, block, stream): + self.tmp_g.launch_callback(func, data, block, stream) + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +runner_dict = dict() + + +def check_runner(deviceId: int): + runner = runner_dict.get(deviceId) + if runner is None: + return True + else: + return False + + +def destroy_runner(deviceId: int): + runner = runner_dict.get(deviceId) + if runner is not None: + runner_dict[deviceId] = None + + +def get_or_create_runner(deviceId: int): + runner = runner_dict.get(deviceId) + + if runner is None: + runner = NPUGraphRunner(deviceId) + runner_dict[deviceId] = runner + return runner \ No newline at end of file diff --git a/ktransformers/util/utils.py b/ktransformers/util/utils.py index 98a44f2..5a9d1cf 100644 --- a/ktransformers/util/utils.py +++ b/ktransformers/util/utils.py @@ -31,8 +31,35 @@ if not torch.xpu.is_available(): from ktransformers.operators.flashinfer_wrapper import MLAWrapperSingleton import socket +import os +import re +import torch.distributed as dist +try: + import torch_npu + from ktransformers.util.ascend.ascend_utils import get_tensor_parallel_size + use_torch_npu = torch_npu.npu.is_available() +except: + use_torch_npu = False + + warm_uped = False + +W8A8_ENABLE = False +Q4_GGUF_LODER = None +USE_NPU_GRAPH = None +WARM_UP_SKIP_CNT = [1, 1] +_USE_NPU_GRAPH = False +_MAX_DECODE_PROFILE = 3 +CUR_DEVICE = None +_MAX_CHUNK_SIZE = int(max(os.getenv("_MAX_CHUNK_SIZE", 4096), 512)) + + +def get_use_npu_graph(): + assert _USE_NPU_GRAPH is not None, "use npu graph is not setting" + return _USE_NPU_GRAPH + + def get_free_ports(n: int, continue_prot: list): 
sockets = [] ports = [] @@ -50,6 +77,10 @@ def get_free_ports(n: int, continue_prot: list): return ports def get_compute_capability(device:torch.device = None): + + if use_torch_npu: + return 0 + if torch.cuda.is_available(): if device is None: num_gpus = torch.cuda.device_count() @@ -97,9 +128,16 @@ def get_all_used_cuda_device(device_map:dict): all_device_list.add(device_map[key]["prefill_device"]) if "prefill_device" in device_map[key] else None if "cpu" in all_device_list: all_device_list.remove("cpu") + + if use_torch_npu: + all_device_list = set([device.replace("cuda", "npu") for device in all_device_list]) + all_device_list = list(all_device_list) return all_device_list + + +# TODO: support NPU def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str = "", device="cuda"): prefix = prefix.replace("orig_module.", "") persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set} @@ -109,6 +147,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str key = prefix + name translated_key = key + # TODO: Merge all loader. # I know this is ugly but lets do it for now. if isinstance(gguf_loader, SafeTensorLoader): @@ -120,7 +159,13 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: ModelLoader, prefix: str if gguf_loader.has_tensor(translated_key) or "kv_b_proj" in translated_key: target_dtype = torch.get_default_dtype() device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map) - print(f"loading {translated_key} to {device}") + + + if use_torch_npu: + device = "cpu" if "embd" in translated_key else CUR_DEVICE + print(f"loading layer {translated_key} to {device}") if torch.distributed.get_rank() == 0 else None + else: + print(f"loading {translated_key} to {device}") if torch.cuda.is_available(): torch.cuda.empty_cache() elif torch.xpu.is_available(): @@ -149,6 +194,8 @@ def sync_all_device(all_device_list): torch.cuda.synchronize(device) elif "xpu" in device.lower(): torch.xpu.synchronize(device) + elif use_torch_npu: + torch_npu.synchronize(device) else: raise RuntimeError("The device {} is not available".format(device)) @@ -228,20 +275,68 @@ def tf_logits_warper(generation_config): def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True, mode = 'normal', force_think: bool = False, chunk_size = 16384, use_flashinfer_mla = False, - num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None): + num_heads = None, head_dim_ckv = None, head_dim_kpe = None, q_head_dim = None, static_cache = None): import os os.environ["TOKENIZERS_PARALLELISM"] = "false" torch._dynamo.config.suppress_errors = True batch_size, seq_length = inputs.shape device_map = model.gguf_loader.tensor_device_map - torch_device = get_device('model.layers.0.self_attn', device_map) - torch_device = torch_device_mapping[torch_device] if torch_device in torch_device_mapping else torch_device + + if use_torch_npu: + vocabulary_size = model.config.vocab_size + topp = torch.tensor([[model.generation_config.top_p]], dtype=torch.float16).npu() + topk = torch.tensor([[model.generation_config.top_k]], dtype=torch.int32).npu() + temperature = torch.tensor([[model.generation_config.temperature]], dtype=torch.float16).npu() + next_token_fake = torch.tensor([[1]], dtype=torch.int32).npu() + next_token_probs = torch.tensor([[1.0]], dtype=torch.float16).npu() + torch_device = CUR_DEVICE + else: + torch_device = 
get_device('model.layers.0.self_attn', device_map) + torch_device = torch_device_mapping[torch_device] if torch_device in torch_device_mapping else torch_device inputs = inputs.to(torch_device) all_cuda_device = get_all_used_cuda_device(device_map) tokens = [] + def decode_one_tokens_npu(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True): + if cuda_graph_runner is None: + use_cuda_graph = False + inputs_embeds = model.model.embed_tokens(cur_token.to('cpu')).to(torch_device) + if use_cuda_graph: + logits = cuda_graph_runner(inputs_embeds, position_ids, cache_position) + else: + # custom_stream = torch.cuda.Stream() + # torch.cuda.set_device(torch_device) + torch_npu.npu.set_device(torch_device) + # with torch.cuda.stream(custom_stream): + logits=model(inputs_embeds=inputs_embeds, + position_ids=position_ids, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, use_cache=True)[0] + if past_key_values != None: + past_key_values.change_seq_length(1) + all_cuda_device = ['npu:' + str(index) for index in range(torch.distributed.get_world_size())] + for device in all_cuda_device: + # torch.cuda.synchronize(device) + torch_npu.npu.synchronize(device) + if generation_config.do_sample: + logits = logits / temperature + torch.manual_seed(0) + probs = logits.view(batch_size, vocabulary_size) + sm = nn.Softmax(dim=-1) + probs = sm(probs).half().npu() + next_token = next_token_fake + torch_npu._npu_topk_topp_sampling(probs, topk, topp, next_token, next_token_probs) + next_token = next_token.squeeze(-1) + else: + next_token_scores = logits_warper(inputs, logits[:, -1, :]) + next_token = torch.argmax(next_token_scores, dim=-1) + return next_token + def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph: bool = True): + if use_torch_npu: + return decode_one_tokens_npu(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph) if cuda_graph_runner is None: use_cuda_graph = False if use_cuda_graph: @@ -252,6 +347,8 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud torch.cuda.set_device(torch_device) elif torch.xpu.is_available(): torch.xpu.set_device(torch_device) + elif use_torch_npu: + torch_npu.set_device(torch_device) else: raise RuntimeError(f"The device: {torch_device} is not available") inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device) @@ -279,6 +376,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud inputs_embeds = model.model.embed_tokens(inputs.to("cpu")) else: inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device) + if use_flashinfer_mla: MLAWrapperSingleton.update_buffer(past_key_values.max_pages) MLAWrapperSingleton.need_plan_all() @@ -288,11 +386,88 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud )[0][:,-1,:].unsqueeze(0).clone().to(torch_device) return logits - + + def decode_wrapper(next_token, position_ids, cache_position, cuda_graph_runner, past_key_values, inputs, seq_length, prof=None): + global warm_uped + global _USE_NPU_GRAPH + if use_cuda_graph: + from ktransformers.util.npu_graph_runner import get_or_create_runner + npu_graph_runner = get_or_create_runner(CUR_DEVICE) + npu_graph_runner.init(batch_size, seq_length) + with 
torch_npu.npu.stream(npu_graph_runner.main_stream): + for i in range(1, max_new_tokens): + if use_flashinfer_mla: + MLAWrapperSingleton.plan_all(None, None, None, position_ids.squeeze(1) + 1, None, + num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size, + model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, + torch.bfloat16) + if use_cuda_graph and ((warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2)): + warm_uped = True + _USE_NPU_GRAPH = True + npu_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True) + cuda_graph_runner = npu_graph_runner + + next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, + cache_position, past_key_values, logits_warper, generation_config, + use_cuda_graph).to(torch_device) + inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) + generated_ids[:, cache_position] = next_token.int() + tokens.append(int(next_token)) + seq_length += 1 + + if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode( + next_token.tolist()) == '<|im_end|>': + print(stream.end(), end="", flush=True) + break + else: + if torch.distributed.get_rank() % get_tensor_parallel_size() == 0: + print(stream.put(next_token.item()), end="", flush=True) + cache_position += 1 + past_key_values.position[0] += 1 + position_ids = cache_position.unsqueeze(0) + + if prof is not None: + prof.step() + npu_graph_runner.destroy() + _USE_NPU_GRAPH = False + else: + for i in range(1, max_new_tokens): + if use_flashinfer_mla: + MLAWrapperSingleton.plan_all(None, None, None, position_ids.squeeze(1) + 1, None, + num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size, + model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, + torch.bfloat16) + next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, + past_key_values, logits_warper, generation_config, use_cuda_graph).to( + torch_device) + inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) + generated_ids[:, cache_position] = next_token.int() + tokens.append(int(next_token)) + seq_length += 1 + + if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode( + next_token.tolist()) == '<|im_end|>': + print(stream.end(), end="", flush=True) + break + else: + if torch.distributed.get_rank() % get_tensor_parallel_size() == 0: + print(stream.put(next_token.item()), end="", flush=True) + cache_position += 1 + past_key_values.position[0] += 1 + position_ids = cache_position.unsqueeze(0) + + if prof is not None: + prof.step() + if prof is not None: + prof.stop() + + if torch.cuda.is_available(): torch.cuda.set_device(torch_device) elif torch.xpu.is_available(): torch.xpu.set_device(torch_device) + elif use_torch_npu: + torch_npu.set_device(torch_device) else: raise RuntimeError(f"The device: {torch_device} is not available") with torch.no_grad(): @@ -304,6 +479,16 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud past_key_values = DynamicUnbalancedFp8Cache.from_legacy_cache(None) else: past_key_values = DynamicNormalCache.from_legacy_cache(None) + elif use_torch_npu and static_cache: + assert isinstance(static_cache, StaticCache), '[ERROR] static_cache format not equal to StaticCache' + past_key_values = static_cache + if past_key_values.max_batch_size < batch_size or past_key_values.max_cache_len < seq_length + max_new_tokens: + print('[WARN] current staticCache 
size exceeded, try create new staticCache...') + past_key_values = StaticCache( + config=model.config, max_batch_size=1, max_cache_len=seq_length + max_new_tokens, device=device_map, dtype=model.dtype + ) + else: + past_key_values.reset() elif mode != 'long_context': past_key_values = StaticCache( config = model.config, max_batch_size = 1, max_cache_len = seq_length + max_new_tokens, device = device_map, dtype = model.dtype @@ -320,19 +505,67 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud logits_warper = tf_logits_warper(generation_config) cache_position = torch.arange(seq_length, device=torch_device, dtype=torch.int32) + if use_torch_npu: + past_key_values.position[0] = seq_length + 1 + generated_ids = torch.zeros( batch_size, seq_length + max_new_tokens + 1, dtype=torch.int, device=torch_device ) generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int) start_time = time.time() - chunk_start = 0 - while chunk_start < seq_length: - chunk_end = min(chunk_start + chunk_size, seq_length) - if past_key_values != None: - past_key_values.cur_idx=cache_position[chunk_start:chunk_end] - logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values) - chunk_start += chunk_size + logits = None + + def prefill_wrapper(prof=None): + nonlocal logits + chunk_start = 0 + while chunk_start < seq_length: + chunk_end = min(chunk_start + chunk_size, seq_length) + if past_key_values != None: + past_key_values.cur_idx=cache_position[chunk_start:chunk_end] + logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values) + chunk_start += chunk_size + if prof is not None: + prof.step() + if prof is not None: + prof.stop() + if logits is None: + raise ValueError('logits cannot be None') + + if use_torch_npu: + global WARM_UP_SKIP_CNT + prof_prefill = os.environ["PROF_PREFILL"] if "PROF_PREFILL" in os.environ else "0" + if prof_prefill == "1" and WARM_UP_SKIP_CNT[0] <= 0: + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False + ) + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU + ], + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=8, repeat=1, skip_first=0), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./prefill_prof"), + record_shapes=True, + profile_memory=True, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config) as prof: + prefill_wrapper(prof) + else: + prefill_wrapper() + WARM_UP_SKIP_CNT[0] -= 1 + else: + + chunk_start = 0 + while chunk_start < seq_length: + chunk_end = min(chunk_start + chunk_size, seq_length) + if past_key_values != None: + past_key_values.cur_idx=cache_position[chunk_start:chunk_end] + logits = chunk_prefill(inputs[:, chunk_start:chunk_end], cache_position[chunk_start:chunk_end], past_key_values) + chunk_start += chunk_size next_token_scores = logits_warper(inputs, logits[:, -1, :]) if generation_config.do_sample: @@ -348,56 +581,106 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cud prefill_count = seq_length prefill_time = first_token_time - if force_think: - print("") - print(stream.put(next_token.item()), end="", flush=True) + if use_torch_npu and torch.distributed.get_rank() % 
get_tensor_parallel_size() == 0: + if force_think: + print("") + print(stream.put(next_token.item()), end="", flush=True) + elif not use_torch_npu: + if force_think: + print("") + print(stream.put(next_token.item()), end="", flush=True) + generated_ids[:, seq_length] = next_token tokens.append(int(next_token)) inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) cache_position = torch.tensor([seq_length], device=torch_device, dtype=torch.int32) position_ids = cache_position.unsqueeze(0) seq_length += 1 + if use_torch_npu: + past_key_values.position += 1 cuda_graph_runner = None start_time = time.time() - for i in range(1, max_new_tokens): - if use_flashinfer_mla: - MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None, - num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size, - model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16) - global warm_uped - if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ): - warm_uped = True - cuda_graph_runner = CUDAGraphRunner() - cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True) - next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device) - inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) - generated_ids[:, cache_position] = next_token.int() - tokens.append(int(next_token)) - seq_length += 1 - - if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>': - print(stream.end(), end="", flush=True) - break + + if not use_torch_npu: + for i in range(1, max_new_tokens): + if use_flashinfer_mla: + MLAWrapperSingleton.plan_all(None,None,None,position_ids.squeeze(1)+1,None, + num_heads, head_dim_ckv, head_dim_kpe, past_key_values.page_size, + model.model.layers[0].self_attn.softmax_scale, torch.bfloat16, torch.bfloat16) + global warm_uped + if use_cuda_graph and ( (warm_uped == True and int(i) == 1) or (warm_uped == False and int(i) == 2) ): + warm_uped = True + cuda_graph_runner = CUDAGraphRunner() + cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True) + next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, logits_warper, generation_config, use_cuda_graph).to(torch_device) + inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1) + generated_ids[:, cache_position] = next_token.int() + tokens.append(int(next_token)) + seq_length += 1 + + if next_token[0].item() == tokenizer.eos_token_id or tokenizer.decode(next_token.tolist()) == '<|im_end|>': + print(stream.end(), end="", flush=True) + break + else: + print(stream.put(next_token.item()), end="", flush=True) + cache_position += 1 + position_ids = cache_position.unsqueeze(0) + else: + prof_decode = os.environ["PROF_DECODE"] if "PROF_DECODE" in os.environ else "0" + prof_ranks = os.environ["PROF_RANK"] if "PROF_RANK" in os.environ else "0" + prof_ranks = [int(r.strip()) for r in prof_ranks.split(",")] + if prof_decode == "1" and torch.distributed.get_rank() in prof_ranks and WARM_UP_SKIP_CNT[1] <= 0: + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + 
profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False + ) + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU + ], + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=_MAX_DECODE_PROFILE, repeat=1, skip_first=0), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./decode_prof"), + record_shapes=True, + profile_memory=True, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config) as prof: + decode_wrapper(next_token, position_ids, cache_position, cuda_graph_runner, past_key_values, inputs, seq_length, prof) else: - print(stream.put(next_token.item()), end="", flush=True) - cache_position += 1 - position_ids = cache_position.unsqueeze(0) + decode_wrapper(next_token, position_ids, cache_position, cuda_graph_runner, past_key_values, inputs, seq_length) + WARM_UP_SKIP_CNT[1] -= 1 total_time = time.time() - start_time tokens_generated = len(tokens) tokens_per_second = tokens_generated / total_time - print("") + if not use_torch_npu: + print("") + + print(f"prompt eval count: {prefill_count} token(s)") + print(f"prompt eval duration: {prefill_time}s") + print(f"prompt eval rate: {prefill_count/prefill_time} tokens/s") + print(f"eval count: {tokens_generated} token(s)") + print(f"eval duration: {total_time}s") + print(f"eval rate: {tokens_per_second} tokens/s") + else: + tp_size = get_tensor_parallel_size() + if torch.distributed.get_rank() % tp_size == 0: + rank = f"[rank:{torch.distributed.get_rank()}]" + msg = f"\n{rank} Eval Time\n" + msg += rank + f"prompt eval count: {prefill_count} token(s)\n" + msg += rank + f"prompt eval duration: {prefill_time:.9f}s\n" + msg += rank + f"prompt eval rate: {prefill_count/prefill_time:.9f} tokens/s\n" + msg += rank + f"eval count: {tokens_generated} token(s)\n" + msg += rank + f"eval duration: {total_time:.9f}s\n" + msg += rank + f"eval rate: {tokens_per_second:.9f} tokens/s\n" + print(msg) - print(f"prompt eval count: {prefill_count} token(s)") - print(f"prompt eval duration: {prefill_time}s") - print(f"prompt eval rate: {prefill_count/prefill_time} tokens/s") - print(f"eval count: {tokens_generated} token(s)") - print(f"eval duration: {total_time}s") - print(f"eval rate: {tokens_per_second} tokens/s") return tokens diff --git a/merge_tensors/merge_safetensor_gguf.py b/merge_tensors/merge_safetensor_gguf.py index f299ab9..636972e 100644 --- a/merge_tensors/merge_safetensor_gguf.py +++ b/merge_tensors/merge_safetensor_gguf.py @@ -12,6 +12,8 @@ from safetensors.torch import save_file import re from collections import defaultdict +SKIP_MTP = True + def read_safetensor_keys_from_folder(folder_path)->dict: """ :param folder_path: folder path @@ -36,7 +38,7 @@ def read_safetensor_keys_from_folder(folder_path)->dict: try: with safe_open(file_path, framework="pt") as f: for key in f.keys(): - if "model.layers.61" in key: + if SKIP_MTP and "model.layers.61" in key: # skip MTP layer continue # try: @@ -94,6 +96,28 @@ def combine_tensor_sources(safetensor_path:str, gguf_path:str): return target_tensor_map, gguf_loader +def combine_w8a8_tensor_sources(safetensor_path: str, gguf_path: str): + gguf_loader = GGUFLoader(gguf_path) + gguf_tensor_file_map = gguf_loader.tensor_file_map + safetensor_tensor_file_map = read_safetensor_keys_from_folder(safetensor_path) + + # build a map for the key to the tensor + # according to the key, we can get the tensor from the file + + target_tensor_map = {} + for key 
in safetensor_tensor_file_map.keys(): + # for all experts, we use the gguf tensor + if ".mlp.experts." in key and "weight_scale" not in key and "weight_offset" not in key: + key = '.'.join(key.split('.')[:5] + key.split('.')[-2:]) + translated_key = translate_name(key) + target_tensor_map[key] = gguf_tensor_file_map[translated_key] + elif ".mlp.experts." in key and ("weight_scale" not in key or "weight_offset" not in key): + continue + else: + target_tensor_map[key] = safetensor_tensor_file_map[key] + + return target_tensor_map, gguf_loader + def write_combined_tensor(target_tensor_map: dict, output_path: str, gguf_loader: GGUFLoader): # Ensure output directory exists os.makedirs(output_path, exist_ok=True) @@ -193,6 +217,7 @@ def main(): parser.add_argument("--safetensor_path", type=str, help="Path to the Safetensor file", default="/mnt/data/model/DeepSeek-V3") parser.add_argument("--gguf_path", type=str, help="Path to the GGUF file", default="/mnt/data/model/DeepseekV3-q4km-gguf") parser.add_argument("--output_path", type=str, help="Path to the output file", default="/mnt/data/model/ktrans-safetensors/DeepSeek-V3-q4km-fp8") + parser.add_argument("--safetensors_format", type=str, help="Safetensors format", default="fp8") # print all the arguments print("All the arguments:") @@ -204,8 +229,18 @@ def main(): safetensor_path = args.safetensor_path gguf_path = args.gguf_path output_path = args.output_path + safetensors_format = args.safetensors_format - target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path) + match safetensors_format: + case "w8a8": + global SKIP_MTP + SKIP_MTP = False + target_tensor_map, gguf_loader = combine_w8a8_tensor_sources(safetensor_path, gguf_path) + case "fp8": + target_tensor_map, gguf_loader = combine_tensor_sources(safetensor_path, gguf_path) + case _: + raise ValueError(f"Unsupported safetensors format: {safetensor_path}") + write_combined_tensor(target_tensor_map, output_path, gguf_loader) return diff --git a/setup.py b/setup.py index 84663b9..8313016 100644 --- a/setup.py +++ b/setup.py @@ -673,10 +673,29 @@ if not torch.xpu.is_available() and not KTRANSFORMERS_BUILD_NPU: ext_modules.append( CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve")) ) + + setup( + name=VersionInfo.PACKAGE_NAME, + version=VersionInfo().get_package_version(), + install_requires=triton_dep, + cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, + ext_modules=ext_modules + ) + + + elif torch.xpu.is_available(): ext_modules = [ CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")), ] + setup( + name=VersionInfo.PACKAGE_NAME, + version=VersionInfo().get_package_version(), + install_requires=triton_dep, + cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, + ext_modules=ext_modules + ) + elif KTRANSFORMERS_BUILD_NPU: ext_modules = [ CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")), @@ -687,11 +706,10 @@ elif KTRANSFORMERS_BUILD_NPU: CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve")) ) + setup( + name=VersionInfo.PACKAGE_NAME, + version=VersionInfo().get_package_version(), + cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, + ext_modules=ext_modules + ) -setup( - name=VersionInfo.PACKAGE_NAME, - version=VersionInfo().get_package_version(), - install_requires=triton_dep, - cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild}, - 
ext_modules=ext_modules -) diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc index a4e8c41..5acb64b 100644 --- a/third_party/llamafile/iqk_mul_mat.inc +++ b/third_party/llamafile/iqk_mul_mat.inc @@ -1,4925 +1,7 @@ -// Adapted from -// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc -// Copyrigth 2024 Iwan Kawrakow - Apache 2.0 Licens -// with additions from -// https://github.com/ikawrakow/ik_llama.cpp/blob/main/ggml/src/iqk/iqk_mul_mat.cpp -// Copyrigth 2024-2025 Iwan Kawrakow - MIT Licens -// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. - -// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- -// vi: set et ft=cpp fenc=utf-8 :vi -// -// Copyright 2024 Iwan Kawrakow -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// -// Copyright (C) 2024-2025 Iwan Kawrakow -// MIT license -// SPDX-License-Identifier: MIT -// - -#include -#include -#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64) - -#include "llama.cpp/ggml-impl.h" -#include "llama.cpp/ggml-quants.h" -#include "sgemm.h" - -// For i-quants, I had to explicitely specify which -// functions to inline / not inline (at least for some -// of the functions), else performance would be significantly -// lower. This is worrysome as things can change with, -// e.g., a different compiler version or running on a different -// CPU. -#ifdef _MSC_VER -#define IQK_NOINLINE __declspec(noinline) -#define IQK_ALWAYS_INLINE inline +#if defined(KTRANSFORMERS_USE_NPU) && KTRANSFORMERS_USE_NPU + // use the ARM version for NPU builds + #include "iqk_mul_mat_arm.inc" #else -#define IQK_NOINLINE __attribute__((__noinline__)) -#define IQK_ALWAYS_INLINE __attribute__((always_inline)) -#endif - -#define GGML_COMMON_IMPL_C -#include "llama.cpp/ggml-common.h" - -// clang-format off - -// This matrix - vector and matrix - matrix multiplication implementation -// for legacy quants, k-quants and i-quants makes prompt processing 150-200% -// (legacy and k-quants) or 250-400% (i-quants) faster. -// compared to mainline llama.cpp (and llamafile). -// It provides implementations for ARM_NEON (all quants) and AVX2 -// (all quants except sub-4 bit i-quants). -// -// Main idea is that unpacking the quants and the block scales to -// be ready for dot products with the corresponding Q8_Y quants -// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type). -// Hence, if we are performing a QX x Q8_Y matrix matrix -// multiplication (as needed for prompt processing), we can get -// a significant speedup by reusing the unpacked QX quants and scales -// for multiplication with several Q8_K columns. We also achieve fewer -// loads from memory, which is the main purpose of tiling in general -// purpose matrix multiplication packages.
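To make the reuse idea in the comment above concrete, a small Python sketch follows; the function and parameter names are invented for illustration, and the real kernels (now split into iqk_mul_mat_x86.inc and iqk_mul_mat_arm.inc) operate on SIMD registers rather than Python lists:

def iqk_style_matmul(packed_rows, unpack, q8_cols, tile=8):
    """Unpack each quantized row once and reuse it for a whole tile of Q8 columns,
    instead of re-unpacking it per column; this saves unpacking work and memory loads."""
    out = []
    for packed in packed_rows:
        x = unpack(packed)  # pay the unpacking cost once per row
        row = [0.0] * len(q8_cols)
        for c0 in range(0, len(q8_cols), tile):  # amortize the unpacked row over a tile of columns
            for c in range(c0, min(c0 + tile, len(q8_cols))):
                row[c] = sum(xi * yi for xi, yi in zip(x, q8_cols[c]))
        out.append(row)
    return out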
- -#include -#include - -#endif - -constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast(98); -constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast(99); - - -namespace { - -typedef struct { - int32_t i1; - int32_t i2; -} mmid_row_mapping; - -struct DataInfo { - float * s; - const char * cy; - size_t bs; - size_t by; - int cur_y = 0; - int ne11; - const mmid_row_mapping * row_mapping = nullptr; - size_t bs2 = 0; - - inline const char * src1_row(int iy) const { - if (!row_mapping) return cy + (cur_y + iy)*by; - int i11 = row_mapping[cur_y + iy].i1 % ne11; - int i12 = row_mapping[cur_y + iy].i2; - return cy + (i11 + i12*ne11)*by; - } - - inline void store(int ix, int iy, float result) const { - *(dst_row(iy) + ix) = result; - //dst_row(iy)[ix] = result; - } - inline float * dst_row(int iy) const { - if (!row_mapping) return s + (cur_y + iy)*bs; - int i12 = row_mapping[cur_y + iy].i2; - int i1 = row_mapping[cur_y + iy].i1; - int i2 = i12; - return s + i1*bs + i2*bs2; - } -}; - -/* -moonll -change param for set_mul_mat -add func16 -*/ - -typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x); - -struct MulMat { - std::array funcs = {}; - mul_mat_t func16 = nullptr; - //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { - IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { - constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small) - - // copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L162 - // MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow - if (func16 && nrc_y >= 16) { - int n_step = (nrc_y - info.cur_y)/16; - for (int ix = 0; ix < nrc_x; ix += k_x_step) { - auto this_info = info; - this_info.s += ix; - int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix; - for (int iy = 0; iy < n_step; ++iy) { - func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x); - this_info.cur_y += 16; - } - } - info.cur_y += 16 * n_step; - if (info.cur_y == nrc_y) return; - } - // end copy - - int n_step = (nrc_y - info.cur_y)/funcs.size(); - if (n_step > 0) { - for (int ix = 0; ix < nrc_x; ix += k_x_step) { - auto this_info = info; - this_info.s += ix; - int this_nrc_x = ix + k_x_step <= nrc_x ? 
k_x_step : nrc_x - ix; - for (int iy = 0; iy < n_step; ++iy) { - funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x); - this_info.cur_y += funcs.size(); - } - } - info.cur_y += funcs.size() * n_step; - } - int n_left = nrc_y - info.cur_y; - if (n_left > 0) { - funcs[n_left-1](n, vx, bx, info, nrc_x); - } - } - static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny); -private: - template static IQK_NOINLINE void set_functions(MulMat& m); -}; - -inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) { - const uint16_t * scales = (const uint16_t *)scales8; - const uint32_t a0 = scales[0] | (scales[1] << 16); - const uint32_t a1 = scales[2] | (scales[3] << 16); - const uint32_t a2 = scales[4] | (scales[5] << 16); - aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030); - aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030); - aux32[2] = a1 & 0x3f3f3f3f; - aux32[0] = a0 & 0x3f3f3f3f; -} - -/* -moonll -decoding tables -*/ -// copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L570 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -#ifdef __AVX2__ -static const uint64_t iq1s_grid_us[2048] = { - 0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200, - 0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000, - 0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101, - 0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101, - 0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202, - 0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200, - 0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001, - 0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202, - 0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201, - 0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001, - 0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101, - 0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101, - 0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202, - 0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200, - 0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201, - 0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002, - 0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101, - 0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200, - 0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102, - 0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101, - 0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001, - 0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100, - 0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200, - 0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101, - 0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100, - 0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000, - 0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202, - 0x0000010102000001, 0x0000010102010001, 
0x0000010102010101, 0x0000010102010200, - 0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101, - 0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201, - 0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002, - 0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001, - 0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001, - 0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002, - 0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000, - 0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101, - 0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000, - 0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101, - 0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202, - 0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201, - 0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000, - 0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100, - 0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102, - 0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002, - 0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000, - 0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101, - 0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101, - 0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200, - 0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002, - 0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001, - 0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101, - 0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101, - 0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101, - 0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102, - 0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100, - 0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002, - 0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100, - 0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000, - 0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101, - 0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101, - 0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001, - 0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102, - 0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201, - 0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202, - 0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001, - 0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001, - 0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101, - 0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102, - 0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200, - 0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, - 0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 
0x0001010001020101, - 0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000, - 0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201, - 0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101, - 0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202, - 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102, - 0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101, - 0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100, - 0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002, - 0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201, - 0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101, - 0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002, - 0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202, - 0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101, - 0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000, - 0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100, - 0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102, - 0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102, - 0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101, - 0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101, - 0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001, - 0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201, - 0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002, - 0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001, - 0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100, - 0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101, - 0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001, - 0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101, - 0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000, - 0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001, - 0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101, - 0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101, - 0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000, - 0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001, - 0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001, - 0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102, - 0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102, - 0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101, - 0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201, - 0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202, - 0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202, - 0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101, - 0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001, - 0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000, - 
0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101, - 0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200, - 0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100, - 0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100, - 0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202, - 0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102, - 0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201, - 0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202, - 0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002, - 0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001, - 0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001, - 0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101, - 0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202, - 0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201, - 0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102, - 0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200, - 0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001, - 0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101, - 0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201, - 0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001, - 0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002, - 0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000, - 0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202, - 0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201, - 0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201, - 0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101, - 0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100, - 0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000, - 0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101, - 0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202, - 0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101, - 0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202, - 0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202, - 0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201, - 0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002, - 0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102, - 0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102, - 0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000, - 0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000, - 0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101, - 0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101, - 0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202, - 0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200, - 0x0002020202020202, 
0x0100000000000101, 0x0100000000010001, 0x0100000000010102, - 0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101, - 0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100, - 0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001, - 0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100, - 0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101, - 0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001, - 0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200, - 0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, - 0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101, - 0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100, - 0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101, - 0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101, - 0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101, - 0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202, - 0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100, - 0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201, - 0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202, - 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102, - 0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200, - 0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201, - 0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000, - 0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002, - 0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100, - 0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000, - 0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100, - 0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000, - 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102, - 0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100, - 0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002, - 0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001, - 0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201, - 0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202, - 0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100, - 0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001, - 0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002, - 0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001, - 0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201, - 0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001, - 0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101, - 0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101, - 0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101, - 0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101, - 0x0100020001010200, 0x0100020001020001, 
0x0100020001020100, 0x0100020001020102, - 0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100, - 0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001, - 0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000, - 0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001, - 0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101, - 0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100, - 0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000, - 0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202, - 0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101, - 0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100, - 0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100, - 0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200, - 0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100, - 0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, - 0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101, - 0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201, - 0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001, - 0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201, - 0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201, - 0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001, - 0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200, - 0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100, - 0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201, - 0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200, - 0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, - 0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001, - 0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102, - 0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001, - 0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201, - 0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100, - 0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000, - 0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102, - 0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001, - 0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202, - 0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102, - 0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101, - 0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201, - 0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101, - 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102, - 0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101, - 0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100, - 0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202, - 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 
0x0101010001010101, - 0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202, - 0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101, - 0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200, - 0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101, - 0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100, - 0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002, - 0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201, - 0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100, - 0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202, - 0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102, - 0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002, - 0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200, - 0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002, - 0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200, - 0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001, - 0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200, - 0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100, - 0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000, - 0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102, - 0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100, - 0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000, - 0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102, - 0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100, - 0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000, - 0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101, - 0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001, - 0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201, - 0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002, - 0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200, - 0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100, - 0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101, - 0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202, - 0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002, - 0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201, - 0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201, - 0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001, - 0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202, - 0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102, - 0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002, - 0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201, - 0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200, - 0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002, - 0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100, - 
0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101, - 0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102, - 0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002, - 0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200, - 0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100, - 0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001, - 0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100, - 0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201, - 0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101, - 0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102, - 0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201, - 0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200, - 0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200, - 0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002, - 0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202, - 0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102, - 0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000, - 0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202, - 0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201, - 0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001, - 0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002, - 0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102, - 0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001, - 0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101, - 0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202, - 0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102, - 0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201, - 0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101, - 0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101, - 0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001, - 0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202, - 0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000, - 0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202, - 0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102, - 0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002, - 0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201, - 0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101, - 0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001, - 0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200, - 0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102, - 0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102, - 0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100, - 0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001, - 0x0102010101020100, 
0x0102010101020101, 0x0102010101020102, 0x0102010101020201, - 0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001, - 0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202, - 0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200, - 0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000, - 0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000, - 0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001, - 0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200, - 0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200, - 0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202, - 0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201, - 0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202, - 0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001, - 0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001, - 0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200, - 0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000, - 0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102, - 0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101, - 0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100, - 0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000, - 0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100, - 0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100, - 0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102, - 0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201, - 0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202, - 0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102, - 0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102, - 0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202, - 0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202, - 0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100, - 0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000, - 0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101, - 0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202, - 0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102, - 0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100, - 0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101, - 0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100, - 0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201, - 0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101, - 0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202, - 0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200, - 0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201, - 0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200, - 0x0200000202000202, 0x0200000202010101, 
0x0200000202020000, 0x0200000202020002, - 0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201, - 0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101, - 0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201, - 0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201, - 0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102, - 0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101, - 0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101, - 0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101, - 0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001, - 0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000, - 0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102, - 0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101, - 0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202, - 0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202, - 0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101, - 0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000, - 0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101, - 0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202, - 0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100, - 0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000, - 0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101, - 0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202, - 0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100, - 0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100, - 0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002, - 0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100, - 0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101, - 0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202, - 0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200, - 0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100, - 0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200, - 0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002, - 0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001, - 0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101, - 0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101, - 0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202, - 0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102, - 0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100, - 0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101, - 0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100, - 0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101, - 0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101, - 0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 
0x0201000101020101, - 0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101, - 0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102, - 0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100, - 0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102, - 0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101, - 0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101, - 0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001, - 0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101, - 0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202, - 0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102, - 0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001, - 0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102, - 0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200, - 0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101, - 0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001, - 0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201, - 0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202, - 0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102, - 0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002, - 0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200, - 0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100, - 0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001, - 0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002, - 0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201, - 0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101, - 0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100, - 0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000, - 0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200, - 0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101, - 0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200, - 0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202, - 0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100, - 0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102, - 0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102, - 0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102, - 0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101, - 0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101, - 0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000, - 0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202, - 0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102, - 0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200, - 0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101, - 0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101, - 
0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100, - 0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202, - 0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101, - 0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201, - 0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001, - 0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101, - 0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200, - 0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002, - 0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001, - 0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000, - 0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101, - 0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202, - 0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100, - 0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102, - 0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200, - 0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101, - 0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201, - 0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000, - 0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202, - 0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201, - 0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200, - 0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002, - 0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101, - 0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100, - 0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001, - 0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201, - 0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000, - 0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102, - 0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001, - 0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201, - 0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100, - 0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002, - 0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001, - 0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101, - 0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002, - 0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000, - 0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101, - 0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100, - 0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200, - 0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200, - 0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102, - 0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200, - 0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002, - 0x0202020002020200, 
0x0202020002020202, 0x0202020100000101, 0x0202020100010100, - 0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001, - 0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001, - 0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102, - 0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202, - 0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202, - 0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000, - 0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101, - 0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202, -}; -#else -static const uint32_t iq1s_grid_us[2048] = { - 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000, - 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101, - 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200, - 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212, - 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011, - 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111, - 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220, - 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022, - 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220, - 0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101, - 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110, - 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111, - 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010, - 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210, - 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221, - 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021, - 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002, - 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101, - 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101, - 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211, - 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110, - 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022, - 0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121, - 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220, - 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001, - 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101, - 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102, - 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012, - 0x00110111, 0x00110210, 0x00120011, 
0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010, - 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111, - 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122, - 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222, - 0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001, - 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102, - 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101, - 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000, - 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101, - 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112, - 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110, - 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211, - 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012, - 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111, - 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120, - 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122, - 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121, - 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221, - 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001, - 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101, - 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101, - 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011, - 0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111, - 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011, - 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122, - 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121, - 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222, - 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101, - 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000, - 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200, - 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110, - 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112, - 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222, - 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021, - 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121, - 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201, - 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 
0x01211002, 0x01211101, 0x01211200, - 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101, - 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011, - 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010, - 0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211, - 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121, - 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000, - 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202, - 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202, - 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211, - 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112, - 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020, - 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121, - 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222, - 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102, - 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100, - 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110, - 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011, - 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111, - 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110, - 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121, - 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222, - 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201, - 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102, - 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201, - 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012, - 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010, - 0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010, - 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110, - 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011, - 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212, - 0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021, - 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021, - 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021, - 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101, - 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101, - 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 
0x12012100, - 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010, - 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111, - 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010, - 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111, - 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120, - 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120, - 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101, - 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001, - 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201, - 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210, - 0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211, - 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111, - 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112, - 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211, - 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010, - 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021, - 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122, - 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221, - 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102, - 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100, - 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101, - 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101, - 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101, - 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012, - 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110, - 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112, - 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210, - 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210, - 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210, - 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010, - 0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110, - 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122, - 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020, - 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021, - 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022, - 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120, - 0x11121121, 
0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222, - 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221, - 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001, - 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102, - 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201, - 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012, - 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111, - 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012, - 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110, - 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110, - 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121, - 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221, - 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220, - 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222, - 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000, - 0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201, - 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012, - 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011, - 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212, - 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221, - 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121, - 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202, - 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202, - 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002, - 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101, - 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210, - 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112, - 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011, - 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011, - 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210, - 0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020, - 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220, - 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222, - 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222, - 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001, - 0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010, - 0x10212111, 0x10222011, 0x10222110, 
0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111, - 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010, - 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110, - 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221, - 0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122, - 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202, - 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100, - 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101, - 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112, - 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111, - 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211, - 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222, - 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221, - 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022, - 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101, - 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211, - 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111, - 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111, - 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010, - 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121, - 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222, - 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000, - 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202, - 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000, - 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202, - 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110, - 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110, - 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222, - 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120, - 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022, - 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101, - 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202, - 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110, - 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110, - 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111, - 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111, - 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 
0x20100121, 0x20110021, 0x20110120, - 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121, - 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001, - 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202, - 0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001, - 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200, - 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011, - 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212, - 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012, - 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110, - 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012, - 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111, - 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020, - 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121, - 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222, - 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102, - 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102, - 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101, - 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212, - 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210, - 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111, - 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212, - 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221, - 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121, - 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002, - 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000, - 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202, - 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112, - 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111, - 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020, - 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221, - 0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022, - 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100, - 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201, - 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112, - 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211, - 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 
0x22211012, - 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121, - 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020, - 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120, - 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200, - 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200, - 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110, - 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011, - 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222, - 0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020, - 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222, -}; -#endif -// end copy https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L570 - -#ifndef HAVE_FANCY_SIMD -const uint64_t keven_signs[128] = { - 0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff, - 0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff, - 0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff, - 0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff, - 0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff, - 0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff, - 0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff, - 0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff, - 0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff, - 0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff, - 0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff, - 0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff, - 0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff, - 0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff, - 0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff, - 0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff, - 0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff, - 0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff, - 0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff, - 0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff, - 0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff, - 0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff, - 0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff, - 0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff, - 0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff, - 0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff, - 0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff, - 0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff, - 0xffffffff01010101, 0x01ffffff010101ff, 
0x01ffffff0101ff01, 0xffffffff0101ffff, - 0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff, - 0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff, - 0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff, -}; -#endif - -} - -/* moonll change mulmat -add typeB and strideB -}*/ - -// Adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L406 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -bool iqk_mul_mat(long Nx, long Ny, long ne00, - int typeA, const void * A, long strideA, - int typeB, const void * B, long strideB, - float * C, long stride_C, int ith, int nth) { - - MulMat mm; - - if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) { - return false; - } - - size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA)); - size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB)); - - - auto nrc_x = (Nx + nth - 1)/nth; - auto first_x = ith*nrc_x; - if (first_x + nrc_x > Nx) nrc_x = Nx - first_x; - - DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0}; - - mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny); - - return true; -} -// end adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L406 - - -bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B, - float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) { - const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping; - assert(row_mapping != nullptr); - - MulMat mm; - int row_size_q8; - /* moonll - - if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) { - return false; - }*/ - int row_size_qx = ggml_row_size((ggml_type)typeA, ne00); - int nrc_x = (Nx + nth - 1)/nth; - int first_x = ith*nrc_x; - if (first_x + nrc_x > Nx) nrc_x = Nx - first_x; - DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)}; - mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny); - return true; -} - -#if defined __x86_64__ || defined(_M_X64) - -#if defined HAVE_FANCY_SIMD - #undef HAVE_FANCY_SIMD -#endif -#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) - #define HAVE_FANCY_SIMD -#endif -//#define HAVE_FANCY_SIMD - -namespace { - -inline float hsum_float_4(__m128 x) { - x = _mm_add_ps(x, _mm_movehl_ps(x, x)); - x = _mm_add_ss(x, _mm_movehdup_ps(x)); - return _mm_cvtss_f32(x); -} -inline float hsum_float_8(__m256 x) { - return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1))); -} - -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) - - -template struct Q8 { - - constexpr static int nrc_y = nrc; - - Q8(const DataInfo& info) { - for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy); - } - -#ifdef HAVE_FANCY_SIMD - inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); } -#endif - inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); } - inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); } - inline 
float scale(int iy, int i) const { return y[iy][i].d; } - - const block_q8 * y[nrc_y]; -}; - -// Handles q4_K and q5_K scales/mins -struct Scales8K { - template - inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) { - make_q4_scales(data, utmp); - const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); - const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1); - accum_mins(mins128, q8, i, c, accd); - const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); - return MM256_SET_M128I(sc128, sc128); - } -#ifdef HAVE_FANCY_SIMD - template - inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) { - auto scales = process_mins_and_scales(data, c, i, q8, accd); - return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1); - } -#endif - template - inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const { - const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0])); - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i q8s = q8.load_bsums(iy, i); - const __m256i prod = _mm256_madd_epi16(mins, q8s); - accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); - } - } -#ifdef HAVE_FANCY_SIMD - const __m512i shuffles512[2] = { - _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302, - 0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100), - _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, - 0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908) - }; -#endif - const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100), - _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)}; - - uint32_t utmp[4]; -}; - -template -inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i prod = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i)); - accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]); - } -} -inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) { - const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); - const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - scales[0] = MM256_SET_M128I(l_scales, l_scales); - scales[1] = MM256_SET_M128I(h_scales, h_scales); -} - -struct ScaleQ3 { - inline __m128i make_scales(const uint16_t * s8) const { - const uint16_t * scales16 = (const uint16_t *)s8; - uint32_t aux0 = scales16[0] | (scales16[1] << 16); - uint32_t aux1 = scales16[2] | (scales16[3] << 16); - uint32_t aux2 = scales16[4] | (scales16[5] << 16); - __m128i scales128 = _mm_set_epi32( - ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030), - ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030), - (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030), - (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030)); - return _mm_add_epi8(scales128, m32); - } - const __m128i m32 = _mm_set1_epi8(-32); -}; - -struct ScaleIQ4XS { - inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) { - uint32_t tmp32 = scales_h | (scales_h << 14); - const __m128i sh = 
_mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4); - const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask); - return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32); - } - const __m128i hshift = _mm_set_epi32(12, 8, 4, 0); - const __m128i lshift = _mm_set_epi32(4, 0, 4, 0); - const __m128i hmask = _mm_set1_epi16(0x03); - const __m128i lmask = _mm_set1_epi8(0xf); - const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400); - const __m128i m32 = _mm_set1_epi16(-32); -}; - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1455 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct Scales8KBase { - template - inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const { - const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0])); - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i q8s = q8.load_bsums(iy, i); - const __m256i prod = _mm256_madd_epi16(mins, q8s); - accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); - } - } - inline __m256i shuffle(__m128i mins) const { - return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0])); - } - const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100), - _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)}; -}; -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1455 - -template -struct BaseDequantizer { - BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {} - inline void new_row(int ix) { - x = (const Block *)((const char *)vx + bx*ix); - } - - const void * vx; - size_t bx; - const Block * x; - - float d; -}; - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1698 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -__m128i inline load_iq4nl_values_128() { - static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; - return _mm_loadu_si128((const __m128i *)kvalues_iq4nl); -} - -__m256i inline load_iq4nl_values_256() { - auto val128 = load_iq4nl_values_128(); - return MM256_SET_M128I(val128, val128); -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1698 - -#ifdef HAVE_FANCY_SIMD -//====================================== Zen4 ================================================== - -struct BlockPermuter { - const __m512i permute1 = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0); - const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4); -}; - -struct Q4Bits { - inline void prepare(const uint8_t * q4) { - auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0); - auto tmp1 = _mm512_and_si512(q4bits, ml); - auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); - values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2); - values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2); - q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1); - tmp1 = _mm512_and_si512(q4bits, ml); - tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); - 
values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2); - values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2); - } - inline void prepare64(const uint8_t * q4) { - auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0); - values[0] = _mm512_and_si512(q4bits, ml); - values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); - q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1); - values[2] = _mm512_and_si512(q4bits, ml); - values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); - } - __m512i values[4]; - const __m512i ml = _mm512_set1_epi8(0xf); - BlockPermuter perm; -}; - -struct Q2Bits { - inline void prepare(const uint8_t * q2) { - - auto q2bits = _mm512_loadu_si512((const __m512i*)q2); - auto tmp = _mm512_srli_epi16(q2bits, 2); - - values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp); - values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp); - values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml); - values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml); - values[0] = _mm512_and_si512(values[0], ml); - values[2] = _mm512_and_si512(values[2], ml); - } - __m512i values[4]; - const __m512i ml = _mm512_set1_epi8(0x03); - BlockPermuter perm; -}; - -struct DequantizerQ4K final : public BaseDequantizer { - DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - bits.prepare(x[i].qs); - auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); - scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]); - scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]); - } - - Q4Bits bits; - Scales8K s8k; -}; - -/* -moonll DequantizerIQ4XS -*/ - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1775 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -__m512i inline load_iq4nl_values_512() { - auto val256 = load_iq4nl_values_256(); - return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1); -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1775 - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1781 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct DequantizerIQ4XS final : public BaseDequantizer { - // Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1782 - DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - prepare(x[i].qs); - auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h); - s8k.accum_mins(scales128, q8, i, -128.f*d, accd); - auto scales256 = MM256_SET_M128I(scales128, scales128); - auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1); - scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]); - scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]); - scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]); - scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]); - } - inline void prepare(const 
uint8_t * q4) { - bits.prepare64(q4); - // We now have in bits.valuse[0]: 0...15, 32...47, 64...79, 96...111 - // bits.valuse[1]: 16..31, 48...63, 80...95, 112..127 - // etc. - auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]); - bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1])); - bits.values[0] = _mm512_shuffle_epi8(values, tmp); - tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]); - bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3])); - bits.values[2] = _mm512_shuffle_epi8(values, tmp); - } - - Q4Bits bits; - Scales8KBase s8k; - ScaleIQ4XS siq4; - const __m512i values; - const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); - const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); - const __m512i shuffles[4] = { - _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1), - _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1), - _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1), - _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1), - }; -}; -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1781 - -struct HighBit5 { - inline void apply(const uint8_t * h, Q4Bits& bits) { - auto hbits256 = _mm256_loadu_si256((const __m256i *)h); - auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1); - bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh)); - bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh)); - bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh)); - bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh)); - } - const __m512i mh = _mm512_set1_epi8(0x10); -}; - -struct HighBit3 { - inline void apply(const uint8_t * h, Q2Bits& bits) { - auto hbits256 = _mm256_loadu_si256((const __m256i *)h); - auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1); - bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh)); - bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh)); - bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh)); - bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh)); - } - const __m512i mh = _mm512_set1_epi8(0x04); -}; - -struct DequantizerQ5K final : public BaseDequantizer { - DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - bits.prepare(x[i].qs); - hbits.apply(x[i].qh, bits); - auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); - scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]); - scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]); - } - - Q4Bits bits; - HighBit5 hbits; - Scales8K s8k; -}; - -struct Scale16 { - inline void make_scales(const __m128i& scales8, __m512i * scales) const { - auto all_scales8 = MM256_SET_M128I(scales8, scales8); - auto scales1 = 
_mm256_shuffle_epi8(all_scales8, shuffle1); - auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2); - scales[0] = _mm512_cvtepi8_epi16(scales1); - scales[1] = _mm512_cvtepi8_epi16(scales2); - } - template - inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8, - const Q8& q8, __m256 * accm, __m512i * scales) const { - process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm); - make_scales(scales8, scales); - } - const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202, - 0x05050505, 0x01010101, 0x04040404, 0x00000000); - const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a, - 0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808); -}; - -struct DequantizerQ2K final : public BaseDequantizer { - DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - bits.prepare(x[i].qs); - const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); - const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); - sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales); - } - - Q2Bits bits; - Scale16 sc16; - const __m128i m4 = _mm_set1_epi8(0xf); - -}; - -struct DequantizerQ3K final : public BaseDequantizer { - DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - bits.prepare(x[i].qs); - hbits.apply(x[i].hmask, bits); - auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales); - sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales); - } - - Q2Bits bits; - HighBit3 hbits; - ScaleQ3 sc3; - Scale16 sc16; - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m32 = _mm_set1_epi8(-32); -}; - -struct DequantizerQ6K final : public BaseDequantizer { - DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - bits.prepare64(x[i].ql); - add_high_bits(x[i].qh, bits); - auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales); - sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales); - } - - inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const { - auto hbits = _mm512_loadu_si512((const __m512i *)qh); - auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh); - auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh); - bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2)); - bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2)); - tmp1 = _mm512_and_si512(hbits, mh); - tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh); - bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2)); - bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2)); - } - - Q4Bits bits; - HighBit3 hbits; - Scale16 sc16; - - const __m512i mh = _mm512_set1_epi8(0x30); - -}; - -template -static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) 
{ - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - Dequantizer deq(vx, bx); - - __m256 accm[nrc_y]; - __m512 accd[nrc_y]; - __m512i scales[2]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); - for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - deq.new_block(i, q8, accm, scales); - - for (int iy = 0; iy < nrc_y; ++iy) { - const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0)); - const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1)); - const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2)); - const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3)); - auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); - sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); - accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); - info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); - } - - } -} -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L2408 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -template -inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) { - const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0)); - const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1)); - const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2)); - const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3)); - auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); - sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); - accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); -} - -template -static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - Dequantizer deq(vx, bx); - - __m256 accm[nrc_y]; - __m512 accd[nrc_y]; - __m512i scales[2]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); - for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - deq.new_block(i, q8, accm, scales); - - for (int iy = 0; iy < nrc_y; ++iy) { - const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0)); - const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1)); - const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2)); - const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3)); - auto sumi = 
_mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); - sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); - accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); - info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); - } - - } -} - -template -static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - Dequantizer deq(vx, bx); - - __m256 accm[nrc_y]; - __m512 accd[nrc_y]; - __m512i scales[4]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); - for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - deq.new_block(i, q8, accm, scales); - - for (int iy = 0; iy < nrc_y; ++iy) { - const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0)); - const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1)); - const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2)); - const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3)); - auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(), - p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]); - accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); - info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); - } - - } -} - -template -static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - constexpr int k_nx = 2; - - Q8<1> q8(info); - - Dequantizer deq1(vx, bx); - Dequantizer deq2(vx, bx); - - Dequantizer * deq[k_nx]; - deq[0] = &deq1; - deq[1] = &deq2; - - __m512i scales[2*k_nx]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - auto accd = _mm512_setzero_ps(); - auto accm = _mm256_setzero_ps(); - - for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix); - - for (int i = 0; i < nb/k_nx; ++i) { - - for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx); - - for (int kx = 0; kx < k_nx; ++kx) { - compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd); - } - - } - if (2*(nb/2) < nb) { - int i0 = 2*(nb/2); - deq[0]->new_block(i0, q8, &accm, scales); - compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd); - } - - auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1)); - info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256))); - } -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L2408 - -#else -// ===================================== Vanilla AVX2 ===================================== - -struct Q4Bits { - inline void prepare(const uint8_t * q4, int j) { - auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0); - values[0] = 
_mm256_and_si256(q4bits, ml); - values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); - q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1); - values[2] = _mm256_and_si256(q4bits, ml); - values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); - } - inline void prepare64(const uint8_t * q4, int j) { - auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0); - values[0] = _mm256_and_si256(q4bits, ml); - values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); - q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1); - values[1] = _mm256_and_si256(q4bits, ml); - values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); - } - inline void prepare16(const uint8_t * q4, int j) { - values[0] = dequant16(q4 + 64*j + 0); - values[1] = dequant16(q4 + 64*j + 16); - values[2] = dequant16(q4 + 64*j + 32); - values[3] = dequant16(q4 + 64*j + 48); - } - inline __m256i dequant16(const uint8_t * qs) const { - const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs); - const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128); - return _mm256_and_si256(ml, aux256); - }; - __m256i values[4]; - const __m256i ml = _mm256_set1_epi8(0xf); -}; - -struct Q2Bits { - inline void prepare(const uint8_t * q2, int j) { - auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j); - values[0] = _mm256_and_si256(q2bits, ml); - values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml); - values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml); - values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml); - } - __m256i values[4]; - const __m256i ml = _mm256_set1_epi8(0x03); -}; - -struct HighBit5 { - inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); } - inline void apply(Q4Bits& bits, bool do_shift) { - bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh)); - bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh)); - bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); - bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh)); - if (do_shift) { - hbits = _mm256_srli_epi16(hbits, 4); - } - } - const __m256i mh = _mm256_set1_epi8(0x10); - __m256i hbits; -}; - -struct HighBit3 { - inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); } - inline void apply(Q2Bits& bits, bool do_shift) { - bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); - bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh)); - bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh)); - bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh)); - if (do_shift) { - hbits = _mm256_srli_epi16(hbits, 4); - } - } - const __m256i mh = _mm256_set1_epi8(0x04); - __m256i hbits; -}; - - -/* -template -inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { - if (j == 0) { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); - const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); - const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], 
q8.load_quants(iy, i, 2))); - const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); - sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); - } - } else { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); - const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); - const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); - const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); - sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); - sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); - } - } -}*/ - -struct DequantizerQ4K final : public BaseDequantizer { - DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { - d = GGML_FP16_TO_FP32(x[i].d); - return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); - } - inline void prepare(int i, int j) { - bits.prepare(x[i].qs, j); - } - - Q4Bits bits; - Scales8K s8k; -}; - -struct DequantizerIQ4XS final : public BaseDequantizer { - DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {} - template - inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { - d = GGML_FP16_TO_FP32(x[i].d); - auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h); - s8k.accum_mins(scales128, q8, i, -128.f*d, accd); - return MM256_SET_M128I(scales128, scales128); - } - inline void prepare(int i, int j) { - bits.prepare16(x[i].qs, j); - bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]); - bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]); - bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]); - bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]); - } - - static __m256i load_values() { - static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; - auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); - return MM256_SET_M128I(val128, val128); - } - - Q4Bits bits; - Scales8K s8k; - ScaleIQ4XS siq4; - const __m256i values; -}; - -struct DequantizerQ5K final : public BaseDequantizer { - DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { - d = GGML_FP16_TO_FP32(x[i].d); - hbits.load(x[i].qh); - return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); - } - inline void prepare(int i, int j) { - bits.prepare(x[i].qs, j); - hbits.apply(bits, j == 0); - } - - Q4Bits bits; - HighBit5 hbits; - Scales8K s8k; -}; - -template -inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d, - __m256 * accm, __m256i * scales) { - const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); - process_mins_16(all_scales, q8, i, d, accm); - prepare_scales_16(all_scales, scales); -} - -struct DequantizerQ3K final : public BaseDequantizer { - DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - - template - inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - 
hbits.load(x[i].hmask); - process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales); - } - inline void prepare(int i, int j) { - bits.prepare(x[i].qs, j); - hbits.apply(bits, j == 0); - } - - Q2Bits bits; - HighBit3 hbits; - ScaleQ3 sc3; - - const __m128i m32 = _mm_set1_epi8(-32); -}; - -struct DequantizerQ2K final : public BaseDequantizer { - DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - - template - inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); - const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); - process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm); - prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales); - } - inline void prepare(int i, int j) { - bits.prepare(x[i].qs, j); - } - - Q2Bits bits; - - const __m128i m4 = _mm_set1_epi8(0xf); -}; - -struct DequantizerQ6K final : public BaseDequantizer { - DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - template - inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { - d = GGML_FP16_TO_FP32(x[i].d); - process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales); - } - inline void prepare(int i, int j) { - bits.prepare64(x[i].ql, j); - auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j); - bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh)); - bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); - bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh)); - bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh)); - } - - Q4Bits bits; - const __m256i mh = _mm256_set1_epi8(0x30); -}; - - -inline __m256i get_scale_shuffle_8(int i); - -inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales); - -inline __m256i get_scale_shuffle_16(int i); - -inline void set_scales_16(const __m256i& all_scales, __m256i* scales); - - -template -static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n%QK_K == 0); - const int nb = n/QK_K; - - Q8 q8(info); - - __m256i all_scales[2]; - __m256i scales[4]; - __m256 accd[nrc_y]; - - Dequantizer deq(vx, bx); - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - - for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - deq.new_block(i, q8, accd, all_scales); - - __m256i sumi[nrc_y]; - - for (int j = 0; j < QK_K/128; ++j) { - deq.prepare(i, j); - set_scales_16(all_scales[j], scales); - multiply_add(deq.bits, scales, j, i, q8, sumi); - } - - for (int iy = 0; iy < nrc_y; ++iy) { - accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, hsum_float_8(accd[iy])); - } - - } - -} - -template -static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - Dequantizer deq(vx, bx); - - __m256 accd[nrc_y]; - __m256i scales[4]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - for 
(int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - auto all_scales = deq.new_block(i, q8, accd); - - __m256i sumi[nrc_y]; - - for (int j = 0; j < QK_K/128; ++j) { - - deq.prepare(i, j); - - set_scales_8(all_scales, j, scales); - - multiply_add(deq.bits, scales, j, i, q8, sumi); - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); - accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); - } - - } - - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, hsum_float_8(accd[iy])); - } - - } -} -#endif // Zen4 or vanilla AVX2 - - - -// -// ============================== Legacy quants -// - -struct DotHelper { - const __m256i m1 = _mm256_set1_epi16(1); -#if defined(__AVX512VNNI__) && defined(__AVX512VL__) - inline __m256i dot(__m256i x, __m256i y) const { - return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y); - } -#else - inline __m256i dot(__m256i x, __m256i y) const { - return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y)); - } -#endif -}; - -struct SignedDot { - DotHelper helper; - inline __m256i compute(__m256i x, __m256i y) const { - return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x)); - } -}; -struct UnsignedDot { - DotHelper helper; - inline __m256i compute(__m256i x, __m256i y) const { - return helper.dot(x, y); - } -}; -template struct Sum4 { - Dot dot; - inline __m256i compute(const __m256i * qx, const Q8 * y) const { - const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs)); - const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs)); - const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs)); - const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs)); - const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1 - const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3 - return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3 - } -}; - -struct Sum4_Q8 { - SignedDot dot; - static inline __m256i add1(__m256i a, __m256i b) { - return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b)); - } - static inline __m256i add2(__m256i a, __m256i b) { - return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b)); - } - inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const { - const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs)); - const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs)); - const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs)); - const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs)); - const __m256i p01 = add1(p0, p1); // 0,1, 0,1, 0,1, 0,1 - const __m256i p23 = add1(p2, p3); // 2,3, 2,3, 2,3, 2,3 - return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3 - } -}; - -struct ScaleHelperQ_0 { - ggml_half scales8[4]; - template - inline __m128 prepare4(const Q * y) { - for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; - return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); - } - template - inline __m128 prepare4(__m128 other_scales, const Q * y) { - return _mm_mul_ps(other_scales, prepare4(y)); - } - template inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); } - template inline float 
prepare1(float d, const Q * y) const { return d*prepare1(y); } -}; -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8187 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -template -struct ScaleHelperQ_0_1 { - ggml_half scales8[4]; - template - inline __m256 prepare4(const Q * y) { - for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; - auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); - return _mm256_set_m128(_mm_mul_ps(s4, min), s4); - } - template - inline __m256 prepare4(__m256 other_scales, const Q * y) { - return _mm_mul256_ps(other_scales, prepare4(y)); - } - template inline std::pair prepare1(const Q * y) const { - float d = GGML_FP16_TO_FP32(y->d); - return std::make_pair(d, -d*float(min_value)); - } - std::pair inline prepare1(const std::pair& dm, const block_q8_1 * y) const { - return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); - } - const __m128 min = _mm_set1_ps(float(-min_value)); -}; -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8187 - -struct ScaleHelperQ_1 { - uint32_t scales8[4]; - const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); - - template - inline __m256 prepare4(const Q * y) { - for (int j = 0; j < 4; ++j) { - // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers - // complain that this breaks strict-aliasing rules. - memcpy(scales8 + j, &y[j].d, sizeof(uint32_t)); - } - return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle)); - } - - template - inline __m256 prepare4(__m256 other_scales, const Q * y) { - return _mm256_mul_ps(other_scales, prepare4(y)); - } - - template inline std::pair prepare1(const Q * y) const { - return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m)); - } - template inline std::pair prepare1(const std::pair& dm, const Q * y) const { - return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m)); - } - std::pair inline prepare1(const std::pair& dm, const block_q8_1 * y) const { - return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); - } -}; - -struct MinusType0 { - inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); } - inline float compute(float d, int) const { return d; } - inline float result(__m256 acc, int) const { return hsum_float_8(acc); } -}; - -template struct MinusType1 { - __m128 accm[nrc_y]; - MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); } - inline __m256 compute(__m256 dm, int iy) { - const __m128 d = _mm256_castps256_ps128(dm); - const __m128 m = _mm256_extractf128_ps(dm, 1); - accm[iy] = _mm_add_ps(accm[iy], m); - return _mm256_set_m128(d, d); - } - inline float compute(const std::pair& dm, int iy) { - accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f)); - return dm.first; - } - inline float result(__m256 acc, int iy) const { - const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); - return hsum_float_4(_mm_add_ps(sum, accm[iy])); - } -}; - -template struct AccumT { - __m256 acc[nrc_y]; - Minus accm; - AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); } - template - inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const 
DataInfo& info, int ix) { - auto qx = unp.quants(); - __m256 dall[nrc_y]; - for (int i = 0; i < nb/4; ++i) { - auto other_scales = unp.set_block_4(i); - for (int iy = 0; iy < nrc_y; ++iy) { - auto s12 = scales.prepare4(other_scales, y[iy] + 4*i); - dall[iy] = accm.compute(s12, iy); - } - for (int iy = 0; iy < nrc_y; ++iy) { - auto pall = sum.compute(qx, y[iy] + 4*i); - acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]); - } - } - if (!is_multiple_of_4) { - for (int i = 4*(nb/4); i < nb; ++i) { - auto other_scales = unp.set_block(i); - for (int iy = 0; iy < nrc_y; ++iy) { - auto s12 = scales.prepare1(other_scales, y[iy] + i); - auto d = accm.compute(s12, iy); - const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs)); - acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]); - } - } - } - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, accm.result(acc[iy], iy)); - //s[iy*bs] = accm.result(acc[iy], iy); - } - } -}; - -template -using AccumType0 = AccumT; - -template -using AccumType1 = AccumT, nrc_y, is_multiple_of_4>; - -using Sum4Type0 = Sum4; -using Sum4Type1 = Sum4; - -template -void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) { - Unpacker unp(vx, bx); - Sum4Type sum4; - Scales scales; - for (int ix = 0; ix < nrc_x; ++ix) { - unp.set_row(ix); - AccumType accum; - accum.compute(nb, unp, scales, sum4, y, info, ix); - } -} - -template -void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n%Unpacker::block_size() == 0); - Q8 q8(info); - int nb = n/Unpacker::block_size(); - if (nb%4 == 0) { - mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( - nb, vx, bx, info, q8.y, nrc_x - ); - } else { - mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( - nb, vx, bx, info, q8.y, nrc_x - ); - } -} - -template -void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n%Unpacker::block_size() == 0); - Q8 q8(info); - int nb = n/Unpacker::block_size(); - if (nb%4 == 0) { - mul_mat_qX_q8_Helper, ScaleHelperQ_1, block_q8_1, nrc_y>( - nb, vx, bx, info, q8.y, nrc_x - ); - } else { - mul_mat_qX_q8_Helper, ScaleHelperQ_1, block_q8_1, nrc_y>( - nb, vx, bx, info, q8.y, nrc_x - ); - } -} - -struct Dequantizer4bit { - const __m256i m4 = _mm256_set1_epi8(0xf); - inline __m256i dequant(const uint8_t * qs) const { - const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs); - return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4); - } -}; - -struct Q8_0_Dequantizer { - inline __m256i dequant(const block_q8_0 * x) const { - return _mm256_loadu_si256((const __m256i *)x->qs); - } -}; - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8455 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct Q8_0_1_Dequantizer { - inline __m256i dequant(const block_q8_0 * x) const { - return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs)); - } -}; -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8455 - -struct Q4_0_Dequantizer { - Dequantizer4bit b4; - const __m256i m8 = _mm256_set1_epi8(-8); - inline __m256i dequant(const block_q4_0 * x) const { - return _mm256_add_epi8(b4.dequant(x->qs), m8); - } -}; - -struct Q4_1_Dequantizer { - 
Dequantizer4bit b4; - inline __m256i dequant(const block_q4_1 * x) const { - return b4.dequant(x->qs); - } -}; - -struct HBitDequantizer { - const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); - const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - const __m256i minus1 = _mm256_set1_epi64x(-1); - inline __m256i to_bytes(const uint8_t * bits) const { - // Note: Data in all ggml quants is at least 2-byte aligned. - // => we can cast to uint16_t and use or on two consecutive entries - // which is faster than memcpy - const uint16_t * aux16 = (const uint16_t *)bits; - const uint32_t aux32 = aux16[0] | (aux16[1] << 16); - //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t)); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle); - bytes = _mm256_or_si256(bytes, mask); - return _mm256_cmpeq_epi8(bytes, minus1); - } -}; - -struct Q5_0_Dequantizer { - Dequantizer4bit b4; - HBitDequantizer hbit; - const __m256i mh = _mm256_set1_epi8((char)0xF0); - inline __m256i dequant(const block_q5_0 * x) const { - const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh); - return _mm256_or_si256(b4.dequant(x->qs), vqh); - } -}; - -struct Q5_1_Dequantizer { - Dequantizer4bit b4; - HBitDequantizer hbit; - const __m256i mh = _mm256_set1_epi8(0x10); - inline __m256i dequant(const block_q5_1 * x) const { - const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh); - return _mm256_or_si256(b4.dequant(x->qs), vqh); - } -}; - -template -struct Q_Unpacker { - Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {} - - const char * cx_0; - const Q * x; - size_t bx; - - Scales scales; - Dequantizer deq; - - __m256i qx[4]; - - inline const __m256i* quants() const { return qx; } - - inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); } - - inline auto set_block_4(int i) { - for (int j = 0; j < 4; ++j) { - qx[j] = deq.dequant(x + 4*i + j); - } - return scales.prepare4(x + 4*i); - } - inline auto set_block(int i) { - qx[0] = deq.dequant(x + i); - return scales.prepare1(x + i); - } -}; - -struct Q8_0_Unpacker final : public Q_Unpacker { - Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - inline static int block_size() { return QK4_0; } -}; -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8574 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct Q8_0_1_Unpacker final : public Q_Unpacker, Q8_0_1_Dequantizer> { - Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} -// using Sum4T = Sum4TypeQ81; - inline static int block_size() { return QK8_0; } -}; -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8574 -struct Q4_0_Unpacker final : public Q_Unpacker { - Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - inline static int block_size() { return QK4_0; } -}; -struct Q5_0_Unpacker final : public Q_Unpacker { - Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - inline static int block_size() { return QK5_0; } -}; -struct Q4_1_Unpacker final : public Q_Unpacker { - Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - inline static int block_size() { return QK4_1; } -}; -struct Q5_1_Unpacker final : public Q_Unpacker { - Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} - inline static int 
block_size() { return QK4_1; } -}; - -template -void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n%Q8_0_Unpacker::block_size() == 0); - Q8 q8(info); - int nb = n/Q8_0_Unpacker::block_size(); - if (nb%4 == 0) { - mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( - nb, vx, bx, info, q8.y, nrc_x - ); - } else { - mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( - nb, vx, bx, info, q8.y, nrc_x - ); - } -} - - - - -/* -moonll -add some structs for DequantizerIQ2XXS -SimpleBits -EvenSignHelper -*/ -struct SimpleBits { - __m256i values[4]; -}; - -// fix for #829: Add checks of AVX512VPOPCNTDQ -#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__) -#define HAVE_AVX512_POPCNT 1 -#else -#define HAVE_AVX512_POPCNT 0 -#endif - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7736 -// with the addition of a branch that handles a missing _mm256_popcnt_epi32 instruction -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct EvenSignHelper { - #if defined HAVE_FANCY_SIMD - // #pragma message("Using AVX512VPOPCNTDQ in even sign helper") - union sbits_t { - __m128i vec; - __mmask32 mask[4]; - }; - IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const { - aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask); - - // fix for #829: Compatibility with processors using Intel Cascade Lake architecture - // If AVX512VPOPCNTDQ extension is not supported, use alternative implementation - #if HAVE_AVX512_POPCNT - auto pcnt = _mm256_popcnt_epi32(aux); - - #else - // Alternative implementation: Using standard bit counting method - __m256i pcnt; - int* pcnt_ptr = reinterpret_cast(&pcnt); - int* aux_ptr = reinterpret_cast(&aux); // Get address of aux directly, avoid unnecessary copies - - #pragma unroll 8 // Hint compiler to unroll loops, increasing throughput of SIMD computing - for (int i = 0; i < 8; i++) { - pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // Use compiler builtin popcount - } - #endif - - sbits_t sbits; - sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7))); - values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]); - values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]); - //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7))); - //const __mmask32 * m32 = (const __mmask32 *)&sign_bits; - //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]); - //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]); - } - const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0); - const __m256i mask = _mm256_set1_epi32(127); - const __m256i mone = _mm256_set1_epi32(1); - #else - inline void sign_value(uint32_t aux32, __m256i& value) const { - auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127], - keven_signs[(aux32 >> 7) & 127], keven_signs[(aux32 >> 0) & 127]); - value = _mm256_sign_epi8(value, signs); - } - #endif -}; - -/* -moonll ad multiply_add for mul_mat_qX_K_q8_K_IQ_1 -add func -get_scale_shuffle_8 -get_scale_shuffle_16 -set_scales_16 -*/ - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1578 -// MIT licensed, Copyright 
(c) 2024-2025 Iwan Kawrakow -inline __m256i get_scale_shuffle_8(int i) { - return _mm256_set1_epi16((2*i) | ((2*i+1) << 8)); -} - -inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) { - scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0)); - scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1)); - scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2)); - scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3)); -} - - -inline __m256i get_scale_shuffle_16(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} - -inline void set_scales_16(const __m256i& all_scales, __m256i * scales) { - scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0)); - scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1)); - scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2)); - scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3)); -} - -template -inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { - if (j == 0) { -#ifdef HAVE_FANCY_SIMD - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); - } -#else - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); - const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); - const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); - const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); - sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); - } -#endif - } else { -#ifdef HAVE_FANCY_SIMD - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); - sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); - } -#else - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); - const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], 
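// What one multiply_add step above computes, written out in scalar form:
// _mm256_maddubs_epi16 multiplies unsigned quant bytes by signed q8 bytes and adds
// adjacent pairs into int16, then _mm256_madd_epi16 multiplies those by the broadcast
// int16 block scales and adds adjacent pairs into int32. (The HAVE_FANCY_SIMD branch
// fuses the same arithmetic with _mm256_dpwssd_epi32.) Saturation of the int16
// intermediate is ignored here for clarity; this is a reference sketch, not the kernel.
#include <cstdint>

static void madd_step_sketch(const uint8_t a[32],   // unpacked (unsigned) weight quants
                             const int8_t  b[32],   // q8 activation quants
                             const int16_t s[16],   // broadcast block scales
                             int32_t out[8]) {
    int16_t t[16];
    for (int i = 0; i < 16; ++i)                     // maddubs: pairwise u8*s8 -> s16
        t[i] = (int16_t)(a[2*i] * b[2*i] + a[2*i + 1] * b[2*i + 1]);
    for (int j = 0; j < 8; ++j)                      // madd: pairwise s16*s16 -> s32
        out[j] = (int32_t)s[2*j] * t[2*j] + (int32_t)s[2*j + 1] * t[2*j + 1];
}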
q8.load_quants(iy, i, 5))); - const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); - const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); - sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); - sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); - } -#endif - } -} - -/* -moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1 -add func -set_scales_8_iq -set_scales_16_iq - -add MUL_MAT -mul_mat_qX_K_q8_K_IQ_1 -mul_mat_qX_K_q8_K_IQ_N -mul_mat_qX_K_q8_K_IQ -*/ - -template -inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) { - if (j == 0) { -#ifdef HAVE_FANCY_SIMD - auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); - auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); - auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); - auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); - sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2)); - sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4)); -#else - const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); - const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); - const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); - const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); - sumi[0] = _mm256_add_epi32(p1, p3); - sumi[1] = _mm256_add_epi32(p2, p4); -#endif - } else { -#ifdef HAVE_FANCY_SIMD - auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); - auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); - auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); - auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); - sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2)); - sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4)); -#else - const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); - const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); - const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); - const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); - sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3)); - sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4)); -#endif - } -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1578 - - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7278 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) { - //#ifdef HAVE_FANCY_SIMD - auto shuffle = j == 0 ? 
_mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100) - : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908); - scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); - scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4))); - //#else - // set_scales_8(all_scales, j, scales); - //#endif - } - -inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) { - #ifdef HAVE_FANCY_SIMD - auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100); - scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); - scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8))); - #else - set_scales_16(all_scales, scales); - #endif - } -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7278 - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7299 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -template -static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - const int nb = n / QK_K; - Q8<1> q8(info); - Dequantizer deq(vx, bx); - __m256i scales[2]; - __m256i q8_quants[4]; - for (int ix = 0; ix < nrc_x; ++ix) { - - __m256 accd = _mm256_setzero_ps(); - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - __m256i sumi[2], all_scales[Dequantizer::num_blocks/8]; - deq.new_block(i, all_scales); - - for (int j = 0; j < QK_K/128; ++j) { - deq.prepare(i, j, q8, q8_quants); - if constexpr (Dequantizer::num_blocks == 8) { - set_scales_8_iq(j, all_scales[0], scales); - } else { - set_scales_16_iq(all_scales[j], scales); - } - multiply_add_1(j, deq.bits, scales, q8_quants, sumi); - } - accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd); - } - - info.store(ix, 0, hsum_float_8(accd)); - } - } - - -template -static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - const int nb = n / QK_K; - Q8 q8(info); - Dequantizer deq(vx, bx); - __m256i scales[4]; - __m256 accd[nrc_y]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); - - deq.new_row(ix); - - for (int i = 0; i < nb; ++i) { - - __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8]; - //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256(); - __m256i mins; - float dmin = deq.new_block(i, all_scales, mins); - for (int iy = 0; iy < nrc_y; ++iy) { - auto bsums = q8.load_bsums(iy, i); - auto prod = _mm256_madd_epi16(mins, bsums); - accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); - } - - for (int j = 0; j < QK_K/128; ++j) { - deq.prepare(i, j); - if constexpr (Dequantizer::num_blocks == 8) { - set_scales_8(all_scales[0], j, scales); - } else { - set_scales_16(all_scales[j], scales); - } - //multiply_add_iq(deq.bits, scales, j, i, q8, sumi); - multiply_add(deq.bits, scales, j, i, q8, sumi); - } - for (int iy = 0; iy < nrc_y; ++iy) { - const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); - accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); - } - } - - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, hsum_float_8(accd[iy])); - } - } -} - -template 
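// Why mul_mat_qX_K_q8_K_IQ_N above only needs q8's precomputed per-block sums
// ("bsums") to account for the block minimum: with w[k] = d*sc*q[k] - dmin*m the dot
// product splits as sum_k w[k]*y[k] = d*sc*sum_k q[k]*y[k] - dmin*m*sum_k y[k], so the
// second term costs one madd against the cached sums. A tiny scalar check of that
// identity with arbitrary example values (not taken from any real model):
static bool bsums_identity_demo_sketch() {
    const int   q[4] = {3, 1, 0, 2};
    const float y[4] = {0.5f, -1.0f, 2.0f, 0.25f};
    const float d = 0.1f, sc = 5.0f;          // super-block scale * block scale
    const float dmin = 0.02f, m = 7.0f;       // super-block min scale * block min
    float direct = 0.0f, qdot = 0.0f, ysum = 0.0f;
    for (int k = 0; k < 4; ++k) {
        direct += (d*sc*q[k] - dmin*m) * y[k];
        qdot   += q[k] * y[k];
        ysum   += y[k];                        // per-block sum, i.e. what bsums caches
    }
    const float split = d*sc*qdot - dmin*m*ysum;
    return direct - split < 1e-6f && split - direct < 1e-6f;   // equal up to rounding
}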
-static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); -#ifdef HAVE_FANCY_SIMD - if constexpr (nrc_y == 1) { - mul_mat_qX_K_q8_K_IQ_1(n, vx, bx, info, nrc_x); - } else { - mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); - } -#else - mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); -#endif -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7299 - -/* -moonll iq1s -core func for iq1s mul_mat_iq1_s_q8_K - -*/ -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L3813 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -template -static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - GGML_ASSERT(n%QK_K == 0); - Q8 q8(info); - __m256i qx[8]; - __m256i scales[4]; - __m256 acc[nrc_y] = {}; - auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000 - __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100); - for (int ix = 0; ix < nrc_x; ++ix) { - auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx); - for (int ibl = 0; ibl < n/QK_K; ++ibl) { - float d = GGML_FP16_TO_FP32(iq1s[ibl].d); - auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh); - auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7)); - scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1)); -#ifdef HAVE_FANCY_SIMD - auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask); - auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9)); -#else - auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask); - auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7))); -#endif - deltas128 = _mm_mullo_epi16(scales128, deltas128); - scales128 = _mm_slli_epi16(scales128, 3); - auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128); - auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128); - auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7 - auto all_scales = MM256_SET_M128I(scales128, scales128); - auto shuffle = shuffle0; - for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { - scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle); - shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)); - } - const uint8_t * qs = iq1s[ibl].qs; - const uint16_t * qh = iq1s[ibl].qh; - for (int ib = 0; ib < QK_K/32; ib += 2) { - qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)], - iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]); - qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)], - iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]); - qs += 8; - } - for (int iy = 0; iy < nrc_y; ++iy) { - auto bsums = q8.load_bsums(iy, ibl); - auto sumi = _mm256_setzero_si256(); - for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { - auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0); - auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1); -#ifdef HAVE_FANCY_SIMD - auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1); - auto dot2 = 
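// The pre-AVX512 branch of the iq1_s kernel above selects between the two delta values
// with the classic and/andnot/or blend, (mask & a) | (~mask & b), where each lane of the
// mask is all-ones or all-zeros from a compare. A minimal SSE2 sketch of that pattern
// (function names are illustrative):
#include <emmintrin.h>   // SSE2

static inline __m128i select_epi16_sketch(__m128i mask, __m128i a, __m128i b) {
    // per 16-bit lane: mask ? a : b
    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}

// e.g. pick -9 where bit 15 of x is set and -7 otherwise, as the delta blend above does:
static inline __m128i pick_delta_sketch(__m128i x) {
    const __m128i hi  = _mm_set1_epi16(-32768);               // 0x8000 without the overflow warning
    const __m128i msk = _mm_cmpeq_epi16(_mm_and_si128(x, hi), hi);
    return select_epi16_sketch(msk, _mm_set1_epi16(-9), _mm_set1_epi16(-7));
}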
_mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2); - sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2)); -#else - auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1); - auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2); - auto dot = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2)); - sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot)); -#endif - } -#ifdef HAVE_FANCY_SIMD - sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas); -#else - sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas)); -#endif - acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]); - } - } - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, 0.125f*hsum_float_8(acc[iy])); - acc[iy] = _mm256_setzero_ps(); - } - } -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L3813 - -/* -moonll iq1s -DequantizerIQ2XXS -DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S -*/ - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8035 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct DequantizerIQ2XXS final : public BaseDequantizer { - DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} - - constexpr static int num_blocks = 8; - - union Data { - __m256i vec; - uint32_t val[8]; - }; - - inline __m128i load_scales(int i) { - d = 0.125f * GGML_FP16_TO_FP32(x[i].d); - const uint16_t * a16 = (const uint16_t *)x[i].qs; - auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12); - return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1)); - } - - inline void new_block(int i, __m256i * scales) { - auto sc16 = load_scales(i); - scales[0] = MM256_SET_M128I(sc16, sc16); - } - inline float new_block(int i, __m256i * scales, __m256i& mins) { - auto sc16 = load_scales(i); - mins = scb.shuffle(sc16); - scales[0] = MM256_SET_M128I(sc16, sc16); - return -d*minv; - } - - inline static void make4(const uint32_t * aux32, __m256i * values) { - const uint8_t * aux8 = (const uint8_t *)aux32; - values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]); - values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]); - values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]); - values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]); - } - - IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const { -#ifdef HAVE_FANCY_SIMD - esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0); - esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2); -#else - esh.sign_value(aux32[1], values[0]); - esh.sign_value(aux32[3], values[1]); - esh.sign_value(aux32[5], values[2]); - esh.sign_value(aux32[7], values[3]); -#endif - } - inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const { - make4(aux32, values); - sign_values(aux32, values); - for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value); - } - inline void 
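// Every kernel in this family finishes by reducing a __m256 accumulator with
// hsum_float_8 before info.store. That helper comes from the surrounding llamafile
// sources, not from this hunk; a conventional stand-alone version looks like this:
#include <immintrin.h>

static inline float hsum_float_8_sketch(__m256 v) {
    __m128 s = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); // fold 8 -> 4
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));                                        // fold 4 -> 2
    s = _mm_add_ss(s, _mm_movehdup_ps(s));                                         // fold 2 -> 1
    return _mm_cvtss_f32(s);
}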
make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const { - make4(aux32, values); - sign_values(aux32, q8); - } - inline void prepare(int i, int j) { - Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); - make4_signed(data.val, min_value, bits.values); - } - inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) { - for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k); - Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); - make4(data.val, bits.values, q8_quants); - } - - constexpr static int minv = 43; - SimpleBits bits; - Scales8KBase scb; - EvenSignHelper esh; - const __m256i min_value = _mm256_set1_epi8(minv); - const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1); -}; - -/* -moonll -add Q8_0_Unpacker && DequantizerIQ2XXS support -add func mul_mat_qX_K_q8_K_IQ -*/ - -// Copied/adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9092 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -template void MulMat::set_functions(MulMat& m) { - if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v) { - m.funcs[0] = mul_mat_qX_0_q8_0_T; - m.funcs[1] = mul_mat_qX_0_q8_0_T; - m.funcs[2] = mul_mat_qX_0_q8_0_T; - m.funcs[3] = mul_mat_qX_0_q8_0_T; - m.funcs[4] = mul_mat_qX_0_q8_0_T; - m.funcs[5] = mul_mat_qX_0_q8_0_T; - m.funcs[6] = mul_mat_qX_0_q8_0_T; - m.funcs[7] = mul_mat_qX_0_q8_0_T; - } - else if constexpr (std::is_same_v || std::is_same_v|| std::is_same_v) { - m.funcs[0] = mul_mat_qX_1_q8_1_T; - m.funcs[1] = mul_mat_qX_1_q8_1_T; - m.funcs[2] = mul_mat_qX_1_q8_1_T; - m.funcs[3] = mul_mat_qX_1_q8_1_T; - m.funcs[4] = mul_mat_qX_1_q8_1_T; - m.funcs[5] = mul_mat_qX_1_q8_1_T; - m.funcs[6] = mul_mat_qX_1_q8_1_T; - m.funcs[7] = mul_mat_qX_1_q8_1_T; - } - else if constexpr (std::is_same_v) { - m.funcs[0] = mul_mat_qX_K_q8_K_IQ; - m.funcs[1] = mul_mat_qX_K_q8_K_IQ; - m.funcs[2] = mul_mat_qX_K_q8_K_IQ; - m.funcs[3] = mul_mat_qX_K_q8_K_IQ; - m.funcs[4] = mul_mat_qX_K_q8_K_IQ; - m.funcs[5] = mul_mat_qX_K_q8_K_IQ; - m.funcs[6] = mul_mat_qX_K_q8_K_IQ; - m.funcs[7] = mul_mat_qX_K_q8_K_IQ; - } - else { -#ifdef HAVE_FANCY_SIMD - if constexpr (std::is_same_v) { - m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512; - m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512; - } else { - m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; - m.funcs[1] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[2] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[3] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[4] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[5] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[6] = mul_mat_qX_K_q8_K_AVX512; - m.funcs[7] = mul_mat_qX_K_q8_K_AVX512; - } -#else - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - m.funcs[0] = mul_mat_qY_K_q8_K_T; - m.funcs[1] = mul_mat_qY_K_q8_K_T; - m.funcs[2] = mul_mat_qY_K_q8_K_T; - m.funcs[3] = mul_mat_qY_K_q8_K_T; - m.funcs[4] = mul_mat_qY_K_q8_K_T; - m.funcs[5] = mul_mat_qY_K_q8_K_T; - m.funcs[6] = mul_mat_qY_K_q8_K_T; - m.funcs[7] = mul_mat_qY_K_q8_K_T; - } else { - m.funcs[0] = mul_mat_qX_K_q8_K_T; - m.funcs[1] = mul_mat_qX_K_q8_K_T; - m.funcs[2] = mul_mat_qX_K_q8_K_T; - m.funcs[3] = mul_mat_qX_K_q8_K_T; - m.funcs[4] = mul_mat_qX_K_q8_K_T; - m.funcs[5] = mul_mat_qX_K_q8_K_T; - 
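// MulMat::set_functions above fills a fixed table of kernel pointers, one entry per
// number of right-hand-side rows (1..8), all instantiated from the same template. A
// stripped-down sketch of that dispatch pattern (the names here are illustrative, not
// the llamafile API):
#include <cstddef>

using mm_kernel_sketch_t = void (*)(int n, const void* x, size_t bx, float* dst, int nrc_x);

template <int NRC_Y>
static void mm_kernel_sketch(int /*n*/, const void* /*x*/, size_t /*bx*/, float* /*dst*/, int /*nrc_x*/) {
    // a real kernel would process NRC_Y rows of the right-hand side at once
}

struct MulMatSketch {
    mm_kernel_sketch_t funcs[8] = {};
    void set_functions() {
        funcs[0] = mm_kernel_sketch<1>; funcs[1] = mm_kernel_sketch<2>;
        funcs[2] = mm_kernel_sketch<3>; funcs[3] = mm_kernel_sketch<4>;
        funcs[4] = mm_kernel_sketch<5>; funcs[5] = mm_kernel_sketch<6>;
        funcs[6] = mm_kernel_sketch<7>; funcs[7] = mm_kernel_sketch<8>;
    }
    void run(int ny, int n, const void* x, size_t bx, float* dst, int nrc_x) const {
        funcs[ny - 1](n, x, bx, dst, nrc_x);   // pick the kernel matching the column count
    }
};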
m.funcs[6] = mul_mat_qX_K_q8_K_T; - m.funcs[7] = mul_mat_qX_K_q8_K_T; - } -#endif - } -} -// end copied/adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9092 - -// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8622 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -struct QFBase { - #ifdef __AVX512F__ - constexpr static int k_step = 16; - using Data = __m512; - using Acc = __m512; - static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); } - static inline Data load(const float * x) { return _mm512_loadu_ps(x); } - static inline Data load(const ggml_bf16_t * x) { - return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16)); - } - static inline Acc acc(Acc prev, const Data& y, const Data& x) { - return _mm512_fmadd_ps(y, x, prev); - } - static inline Acc acc_first(const Data& y, const Data& x) { - return _mm512_mul_ps(y, x); - } - static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); } - static inline float hsum(Acc acc) { - return _mm512_reduce_add_ps(acc); - } - template - static inline Data load4Floats(const Float * x) { - return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0); - } - static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { - acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc); - acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); - acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); - acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); - return acc; - } - static inline Acc acc_r4_first(const Data * xv, const Data& yv) { - auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00)); - acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); - acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); - acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); - return acc; - } - static inline __m128 hsum_r4(Acc acc) { - auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1)); - auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3)); - return _mm_add_ps(sum1, sum2); - } - #else - constexpr static int k_step = 8; - using Data = __m256; - using Acc = __m256; - static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); } - static inline Data load(const float * x) { return _mm256_loadu_ps(x); } - static inline Data load(const ggml_bf16_t * x) { - return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16)); - } - static inline Acc acc(Acc prev, const Data& y, const Data& x) { - return _mm256_fmadd_ps(y, x, prev); - } - static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); } - static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { - acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc); - acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc); - acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); - acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); - return acc; - } - static inline Acc acc_r4_first(const Data * xv, const Data& yv) { - auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00)); - acc = _mm256_fmadd_ps(xv[1], 
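// QFBase::load above converts fp16 inputs with the CVTPH instructions and bf16 inputs by
// placing the stored 16 bits into the upper half of a float. The bf16 trick in scalar
// form (the vector code does the same with a 16-bit left shift per lane):
#include <cstdint>
#include <cstring>

static inline float bf16_to_f32_sketch(uint16_t bits) {
    const uint32_t u = (uint32_t)bits << 16;   // bf16 is the top 16 bits of an IEEE float
    float f;
    std::memcpy(&f, &u, sizeof(f));            // type-pun without undefined behaviour
    return f;
}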
_mm256_shuffle_ps(yv, yv, 0x55), acc); - acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); - acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); - return acc; - } - static inline Acc acc_first(const Data& y, const Data& x) { - return _mm256_mul_ps(y, x); - } - static inline float hsum(Acc acc) { - return hsum_float_8(acc); - } - static inline __m128 hsum_r4(Acc acc) { - return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); - } - template - static inline Data load4Floats(const Float * x) { - return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0); - } - #endif - static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); } - static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); } - static inline __m128 load128(const ggml_bf16_t * x) { - return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16)); - } - }; - template struct QFT final : public QFBase { - constexpr static int nrc = nrc_in; - QFT(const DataInfo& info) { - for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy); - } - QFT(const char * cx, size_t bx) { - for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx); - } - IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); } - IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); } - IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const { - xv[0] = load1(ix+0, i); - xv[1] = load1(ix+1, i); - xv[2] = load1(ix+2, i); - xv[3] = load1(ix+3, i); - #ifdef __AVX512F__ - auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]); - auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]); - auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]); - auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]); - xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); - xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); - xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); - xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); - #else - auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]); - auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]); - auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]); - auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]); - xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); - xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); - xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); - xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); - #endif - } - const Float * y[nrc]; - }; - - - -template -IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) { - int nb = n/QFBase::k_step; - int nb4 = n/4; - Qy y(info); - Qx x(cx + ix0*bx, bx); - QFBase::Data xv[Qx::nrc]; - QFBase::Acc acc[Qx::nrc*Qy::nrc]; - auto yv = y.load1(0, 0); - for (int ix = 0; ix < Qx::nrc; ++ix) { - xv[ix] = x.load1(ix, 0); - acc[ix] = QFBase::acc_first(yv, xv[ix]); - } - for (int iy = 1; iy < Qy::nrc; ++iy) { - yv = y.load1(iy, 0); - for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]); - } - for (int i = 1; i < nb; ++i) { - yv = y.load1(0, i); - for (int ix = 0; ix < Qx::nrc; ++ix) { - xv[ix] = x.load1(ix, i); - acc[ix] = QFBase::acc(acc[ix], 
yv, xv[ix]); - } - for (int iy = 1; iy < Qy::nrc; ++iy) { - yv = y.load1(iy, i); - for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); - } - } - for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) { - yv = y.load_tail(0, i); - for (int ix = 0; ix < Qx::nrc; ++ix) { - xv[ix] = x.load_tail(ix, i); - acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]); - } - for (int iy = 1; iy < Qy::nrc; ++iy) { - yv = y.load_tail(iy, i); - for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); - } - } - for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix])); -} -// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done -// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in -// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now. -template -void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - const char * cx = (const char *)vx; - // TBD if we want this - //if constexpr (nrc_y == 1) { - // constexpr int k_nx = 2; - // for (int ix = 0; ix < nrc_x/k_nx; ++ix) { - // mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, ix*k_nx, info); - // } - // if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) { - // int nx = nrc_x - lastx; - // switch (nx) { - // case 1: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; - // case 2: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; - // case 3: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; - // } - // //mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); - // } - // return; - //} -#ifdef __AVX512F__ - constexpr int k_nx = 5; -#else - constexpr int k_nx = nrc_y == 1 ? 
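// The mul_mat_fX_fY_T driver here walks the nrc_x output rows in fixed tiles of k_nx and
// hands any leftover rows to smaller template instantiations through a switch. Its
// control flow reduces to the skeleton below, where process_tile_sketch<N> stands in for
// mul_mat_Qx_Qy_MxN and the tile width is fixed for brevity (the real code picks it per
// ISA and per nrc_y):
template <int NX>
static void process_tile_sketch(int /*first_x*/) {
    // compute NX output rows starting at first_x
}

static void tiled_loop_sketch(int nrc_x) {
    constexpr int k_nx = 5;                       // illustrative tile width
    for (int ix = 0; ix < nrc_x / k_nx; ++ix)
        process_tile_sketch<k_nx>(ix * k_nx);     // full tiles
    const int last_x = k_nx * (nrc_x / k_nx);
    switch (nrc_x - last_x) {                     // 0..k_nx-1 leftover rows
        case 1: process_tile_sketch<1>(last_x); break;
        case 2: process_tile_sketch<2>(last_x); break;
        case 3: process_tile_sketch<3>(last_x); break;
        case 4: process_tile_sketch<4>(last_x); break;
        default: break;                           // no leftovers
    }
}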
4 : 2; -#endif - for (int ix = 0; ix < nrc_x/k_nx; ++ix) { - mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, ix*k_nx, info); - } - int last_x = k_nx*(nrc_x/k_nx); - if (last_x == nrc_x) return; - int nx = nrc_x - last_x; -#ifdef __AVX512F__ - switch (nx) { - case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - case 4: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - } -#else - if constexpr (nrc_y == 1) { - switch (nx) { - case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - } - } else { - switch (nx) { - case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; - } - } -#endif -} - -template -void set_mul_mat_f(MulMat& mm) { - for (auto& f : mm.funcs) f = nullptr; - mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>; - mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>; - mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>; - mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>; - mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>; -#ifndef __AVX512F__ - mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>; -#endif -} -// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8622 - -/* -moonll -add typeb TO compare return not expected type of weight matrix -add IQ2XSS -add IQ1_S -add GGML_TYPE_IQ4_XS -*/ - -// Modifications extracted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9231 -// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow -bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { - (void)Ny; - - auto expected_typeB = GGML_TYPE_Q8_K; - switch (typeA) { - case GGML_TYPE_Q2_K: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_Q3_K: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_Q4_K: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_Q5_K: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_Q6_K: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_IQ4_XS: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_IQ2_XXS: - assert (ne00 % QK_K == 0); - MulMat::set_functions(mm); - break; - case GGML_TYPE_Q4_0: - assert (ne00 % QK4_0 == 0); - MulMat::set_functions(mm); - expected_typeB = GGML_TYPE_Q8_0; - break; - case GGML_TYPE_Q4_1: - assert (ne00 % QK4_1 == 0); - MulMat::set_functions(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; - break; - case GGML_TYPE_Q5_0: - assert (ne00 % QK5_0 == 0); - MulMat::set_functions(mm); - expected_typeB = GGML_TYPE_Q8_0; - break; - case GGML_TYPE_Q5_1: - assert (ne00 % QK5_1 == 0); - MulMat::set_functions(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; - break; - case GGML_TYPE_Q8_0: - assert (ne00 % QK8_0 == 0); -#ifdef HAVE_FANCY_SIMD - MulMat::set_functions(mm); - expected_typeB = GGML_TYPE_Q8_1_X4; -#else - MulMat::set_functions(mm); - expected_typeB = GGML_TYPE_Q8_0_X4; -#endif - break; - case GGML_TYPE_IQ1_S: - mm.funcs[0] = mul_mat_iq1_s_q8_K<1>; - mm.funcs[1] = mul_mat_iq1_s_q8_K<2>; - mm.funcs[2] = mul_mat_iq1_s_q8_K<3>; - mm.funcs[3] = mul_mat_iq1_s_q8_K<4>; - mm.funcs[4] = mul_mat_iq1_s_q8_K<5>; - 
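// MulMat::set_mul_mat above is essentially a table from the weight quantization (typeA)
// to a kernel set plus the activation quantization it expects (typeB); it returns false
// on a mismatch so the caller can fall back to the generic path. The shape of that
// check, sketched with a stand-in enum rather than the real GGML type ids:
enum class QTypeSketch { Q8_K, Q8_0, Q8_1_X4, Q4_0, Q4_1, Q6_K, IQ1_S };

static bool check_rhs_type_sketch(QTypeSketch typeA, QTypeSketch typeB) {
    QTypeSketch expected = QTypeSketch::Q8_K;     // default for the K-quants / i-quants
    switch (typeA) {
        case QTypeSketch::Q4_0:  expected = QTypeSketch::Q8_0;    break;
        case QTypeSketch::Q4_1:  expected = QTypeSketch::Q8_1_X4; break;
        case QTypeSketch::Q6_K:
        case QTypeSketch::IQ1_S: /* keep the Q8_K default */      break;
        default: return false;                    // weight type not handled by these kernels
    }
    return typeB == expected;                     // caller must quantize activations to match
}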
mm.funcs[5] = mul_mat_iq1_s_q8_K<6>; - mm.funcs[6] = mul_mat_iq1_s_q8_K<7>; - mm.funcs[7] = mul_mat_iq1_s_q8_K<8>; - #ifdef HAVE_FANCY_SIMD - mm.func16 = mul_mat_iq1_s_q8_K<16>; - #endif - // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); - expected_typeB = GGML_TYPE_Q8_K; - break; - - default: - { - // printf("case:%d",typeA); - return false; - } - - } - - - - return ggml_type(typeB) == expected_typeB; - -} -// end extracted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9231 - -} // namespace - -/* -iq1_s is not support for arm -*/ -#else // __aarch64__ - -namespace { - -template struct Q8 { - - constexpr static int nrc_y = nrc; - - Q8(const DataInfo& info) { - for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy); - } - - inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); } - inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); } - inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); } - inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); } - inline int16x8_t load_bsums8(int iy, int i) const { - auto q8s = vld1q_s16_x2(y[iy][i].bsums); - return vpaddq_s16(q8s.val[0], q8s.val[1]); - } - inline float scale(int iy, int i) const { return y[iy][i].d; } - - const block_q8 * y[nrc_y]; -}; - -template -IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - - Dequantizer deq(vx, bx, nrc_y); - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - - float32x4_t acc[nrc_y]; - for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); - -//#pragma GCC unroll 4 - for (int i = 0; i < nb; ++i) { - - int32x4_t sumi[nrc_y]; - for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); - - if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) { - deq.process_scales(i, q8, acc); - deq.prepare(i, 0); - deq.compute(q8, i, 0, sumi); - deq.prepare(i, 1); - deq.compute(q8, i, 1, sumi); - } else { - if constexpr (Dequantizer::num_blocks() == 8) { - auto scales = deq.new_block(i, q8, acc); - deq.prepare(i, 0); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); - deq.prepare(i, 1); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); - } - else if constexpr (Dequantizer::num_blocks() == 16) { - auto scales = deq.new_block(i, q8, acc); - deq.prepare(i, 0); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); - deq.prepare(i, 1); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); - } - else { - GGML_ASSERT(false); - } - } - -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); - } - } - -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, vaddvq_f32(acc[iy])); - } - } -} -template -IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / 
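// The ARM kernels below lean on ggml_vdotq_s32, which maps to the SDOT instruction when
// the CPU's dotprod extension is available: each 32-bit lane of the accumulator gains
// the sum of four int8*int8 products. A scalar model of one such step:
#include <cstdint>

static void vdot_s32_sketch(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int lane = 0; lane < 4; ++lane) {
        int32_t sum = 0;
        for (int k = 0; k < 4; ++k)
            sum += (int32_t)a[4*lane + k] * (int32_t)b[4*lane + k];
        acc[lane] += sum;   // SDOT accumulates into the existing lane value
    }
}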
QK_K; - - Q8 q8(info); - - Dequantizer deq(vx, bx, nrc_y); - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - - float32x4_t acc[nrc_y]; - for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); - - for (int i = 0; i < nb; ++i) { - - int32x4_t sumi[nrc_y]; - for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); - - if constexpr (Dequantizer::num_blocks() == 8) { - auto scales = deq.new_block(i); - deq.prepare(i, 0); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); - deq.prepare(i, 1); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); - } - else if constexpr (Dequantizer::num_blocks() == 16) { - auto scales = deq.new_block(i); - deq.prepare(i, 0); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); - deq.prepare(i, 1); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); - } - else { - GGML_ASSERT(false); - } -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); - } - } -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, vaddvq_f32(acc[iy])); - } - } -} - -template -IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8, - const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) { - auto mzero = vdupq_n_s32(0); - const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val; - const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val; - - auto q8b_1 = q8.load_quants(iy, i, 4*j+0); - auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1 - auto q8b_2 = q8.load_quants(iy, i, 4*j+1); - auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2 - auto p12 = vpaddq_s32(p1, p2); - - auto q8b_3 = q8.load_quants(iy, i, 4*j+2); - auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3 - auto q8b_4 = q8.load_quants(iy, i, 4*j+3); - auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4 - auto p34 = vpaddq_s32(p3, p4); - - auto pall = vpaddq_s32(p12, p34); - sumi = vmlaq_s32(sumi, scales.val[j], pall); -} -template -IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8, - const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) { - auto mzero = vdupq_n_s32(0); - - auto q8b_1 = q8.load_quants(iy, i, 4*j+0); - auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1 - auto q8b_2 = q8.load_quants(iy, i, 4*j+1); - auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2 - auto p12 = vpaddq_s32(p1, p2); - - auto q8b_3 = q8.load_quants(iy, i, 4*j+2); - auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3 - auto q8b_4 = q8.load_quants(iy, i, 4*j+3); - auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4 - auto p34 = vpaddq_s32(p3, p4); - - auto pall = vpaddq_s32(p12, p34); - sumi = vmlaq_s32(sumi, scales, pall); -} - -template -IQK_ALWAYS_INLINE void 
compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8, - const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) { - - auto mzero = vdupq_n_s32(0); - auto q8b_1 = q8.load_quants(iy, i, 4*j+0); - auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]), - ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1, - auto q8b_2 = q8.load_quants(iy, i, 4*j+1); - auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]), - ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4, - auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3 - sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12); - - auto q8b_3 = q8.load_quants(iy, i, 4*j+2); - auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]), - ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5, - auto q8b_4 = q8.load_quants(iy, i, 4*j+3); - auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]), - ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7, - auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7 - sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34); -} - -template -inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - auto q8s = q8.load_bsums8(iy, i); - int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s)); - int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s)); - float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2)); - acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i))); - } -} -template -inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - auto q8s = q8.load_bsums(iy, i); - int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0])); - int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0])); - int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1])); - int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1])); - float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4))); - acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i))); - } -} - -struct Scales8 { - uint32_t utmp[4]; - const uint8_t * sc8 = (const uint8_t *)utmp; - template - inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) { - make_q4_scales(x.scales, utmp); - int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8)); - accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin)); - - uint8x8_t scales8 = vld1_u8(sc8); - uint16x8_t scales16 = vmovl_u8(scales8); - int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))}; - return scales; - } -}; - -struct Q4bits { - const uint8x16_t m4b = vdupq_n_u8(0xf); - uint8x16x4_t b1, b2; - inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const { - b.val[0] = vandq_u8(val[0], m4b); - b.val[2] = vshrq_n_u8(val[0], 4); - b.val[1] = vandq_u8(val[1], m4b); - b.val[3] = vshrq_n_u8(val[1], 4); - } - inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const { - b.val[0] = vandq_u8(val[0], m4b); - b.val[1] = vshrq_n_u8(val[0], 4); - b.val[2] = 
vandq_u8(val[1], m4b); - b.val[3] = vshrq_n_u8(val[1], 4); - } - inline void prepare(const uint8_t * qs) { - auto q4bits = vld1q_u8_x2(qs); - prepare4(b1, q4bits.val); - q4bits = vld1q_u8_x2(qs+32); - prepare4(b2, q4bits.val); - } - inline void prepare_v2(const uint8_t * qs) { - auto q4bits = vld1q_u8_x4(qs); - prepare4(b1, q4bits.val+0); - prepare4(b2, q4bits.val+2); - } - inline void prepare64(const uint8_t * qs) { - auto q4bits = vld1q_u8_x4(qs); - b1.val[0] = vandq_u8(q4bits.val[0], m4b); - b1.val[1] = vandq_u8(q4bits.val[1], m4b); - b1.val[2] = vandq_u8(q4bits.val[2], m4b); - b1.val[3] = vandq_u8(q4bits.val[3], m4b); - b2.val[0] = vshrq_n_u8(q4bits.val[0], 4); - b2.val[1] = vshrq_n_u8(q4bits.val[1], 4); - b2.val[2] = vshrq_n_u8(q4bits.val[2], 4); - b2.val[3] = vshrq_n_u8(q4bits.val[3], 4); - } - inline void prepare16(const uint8_t * qs) { - auto q4bits = vld1q_u8_x2(qs); - prepare4_16(b1, q4bits.val); - q4bits = vld1q_u8_x2(qs+32); - prepare4_16(b2, q4bits.val); - } - inline void prepare16_v2(const uint8_t * qs) { - auto q4bits = vld1q_u8_x4(qs); - prepare4_16(b1, q4bits.val+0); - prepare4_16(b2, q4bits.val+2); - } -}; - -struct Q2bits { - const uint8x16_t m4b = vdupq_n_u8(0x03); - uint8x16x4_t b1, b2; - inline void prepare(const uint8_t * qs) { - auto q2bits = vld1q_u8_x2(qs); - b1.val[0] = vandq_u8(q2bits.val[0], m4b); - b1.val[1] = vandq_u8(q2bits.val[1], m4b); - - q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); - q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); - b1.val[2] = vandq_u8(q2bits.val[0], m4b); - b1.val[3] = vandq_u8(q2bits.val[1], m4b); - - q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); - q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); - b2.val[0] = vandq_u8(q2bits.val[0], m4b); - b2.val[1] = vandq_u8(q2bits.val[1], m4b); - - q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); - q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); - b2.val[2] = vandq_u8(q2bits.val[0], m4b); - b2.val[3] = vandq_u8(q2bits.val[1], m4b); - } -}; - -template -struct BaseDequantizer { - BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {} - inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); } - const void * vx; - const block_q * x; - const size_t bx; - const int nrc; -}; - -struct DequantizerQ4K final : public BaseDequantizer { - DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 8; } - constexpr static bool should_scale_quants() { return false; } - - template - inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { - d = GGML_FP16_TO_FP32(x[i].d); - return s8.process_scales_mins(x[i], q8, i, acc); - } - inline void prepare(int i, int j) { - if (nrc == 1) bits.prepare_v2(x[i].qs+64*j); - else bits.prepare(x[i].qs+64*j); - } - - Q4bits bits; - Scales8 s8; - - float d; -}; - -struct HighBit5 { - const uint8x16_t mhb = vdupq_n_u8(0x10); - uint8x16x2_t bits; - inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) { - b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb)); - b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb)); - b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb)); - b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb)); - - b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb)); - b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb)); - b2.val[2] = vorrq_u8(b2.val[2], 
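// Q2bits above peels four 2-bit weights out of every byte by repeatedly shifting the
// whole vector right by 2 and masking with 0x03. The same extraction per byte, in scalar
// form and plane-major order (how the planes map onto positions within a q2_K block is
// handled by the scale logic elsewhere):
#include <cstdint>

static void unpack_2bit_sketch(const uint8_t packed[32], uint8_t out[128]) {
    for (int shift = 0; shift < 4; ++shift)            // the four 2-bit "planes"
        for (int j = 0; j < 32; ++j)
            out[32*shift + j] = (packed[j] >> (2*shift)) & 0x03;
}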
vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb)); - b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb)); - - if (do_shift) { - bits.val[0] = vshrq_n_u8(bits.val[0], 4); - bits.val[1] = vshrq_n_u8(bits.val[1], 4); - } - } -}; - -struct HighBit3 { - const uint8x16_t mhb = vdupq_n_u8(0x04); - uint8x16x2_t bits; - inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) { - b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb)); - b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb)); - b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb)); - b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb)); - - b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb)); - b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb)); - b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb)); - b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb)); - - if (do_shift) { - bits.val[0] = vshrq_n_u8(bits.val[0], 4); - bits.val[1] = vshrq_n_u8(bits.val[1], 4); - } - } -}; - -struct DequantizerQ5K final : public BaseDequantizer { - DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 8; } - constexpr static bool should_scale_quants() { return false; } - - template - inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { - d = GGML_FP16_TO_FP32(x[i].d); - h.bits = vld1q_u8_x2(x[i].qh); - return s8.process_scales_mins(x[i], q8, i, acc); - } - inline void prepare(int i, int j) { - bits.prepare(x[i].qs+64*j); - h.apply(bits.b1, bits.b2, j == 0); - } - - Q4bits bits; - HighBit5 h; - Scales8 s8; - - uint8x16x2_t hbits; - - float d; -}; - -inline int32x4x4_t make_wider(const int16x8x2_t& scales16) { - int32x4x4_t scales = { - vmovl_s16(vget_low_s16 (scales16.val[0])), - vmovl_s16(vget_high_s16(scales16.val[0])), - vmovl_s16(vget_low_s16 (scales16.val[1])), - vmovl_s16(vget_high_s16(scales16.val[1])), - }; - return scales; -} - -template -inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) { - int16x8x2_t scales16; - scales16.val[0] = vmovl_s8(vget_low_s8(scales8)); - scales16.val[1] = vmovl_s8(vget_high_s8(scales8)); - accum_mins_16(scales16, q8, acc, i, c); - return make_wider(scales16); -} - -struct DequantizerQ6K final : public BaseDequantizer { - DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 16; } - constexpr static bool should_scale_quants() { return false; } - - template - inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { - d = GGML_FP16_TO_FP32(x[i].d); - return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d); - } - inline void prepare(int i, int j) { - - auto hbits = vld1q_u8_x2(x[i].qh + 32*j); - - bits.prepare64(x[i].ql+64*j); - bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb)); - bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb)); - bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb)); - bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb)); - - bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb)); - bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb)); - bits.b2.val[2] = vorrq_u8(bits.b2.val[2], 
vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb)); - bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb)); - - } - - Q4bits bits; - - const uint8x16_t mhb = vdupq_n_u8(0x30); - - float d; -}; - -struct DequantizerQ3K final : public BaseDequantizer { - DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 16; } - constexpr static bool should_scale_quants() { return false; } - - template - inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { - d = GGML_FP16_TO_FP32(x[i].d); - h.bits = vld1q_u8_x2(x[i].hmask); - const uint16_t * sc16 = (const uint16_t *)x[i].scales; - uint32_t aux0 = sc16[0] | (sc16[1] << 16); - uint32_t aux1 = sc16[2] | (sc16[3] << 16); - uint32_t aux2 = sc16[4] | (sc16[5] << 16); - aux32[0] = (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030); - aux32[1] = (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030); - aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030); - aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030); - return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d); - } - - inline void prepare(int i, int j) { - bits.prepare(x[i].qs+32*j); - h.apply(bits.b1, bits.b2, j == 0); - } - - uint32_t aux32[4]; - - Q2bits bits; - - HighBit3 h; - - float d; -}; - -struct DequantizerQ2K final : public BaseDequantizer { - DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 16; } - constexpr static bool should_scale_quants() { return true; } - - template - inline void process_scales(int i, const Q8& q8, float32x4_t * acc) { - d = GGML_FP16_TO_FP32(x[i].d); - auto scales_and_mins = vld1q_u8(x[i].scales); - auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4)); - int16x8x2_t scales16; - scales16.val[0] = vmovl_s8(vget_low_s8(mins8)); - scales16.val[1] = vmovl_s8(vget_high_s8(mins8)); - accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin)); - - scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf)); - } - - template - inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { - process_scales(i, q8, acc); - int16x8x2_t scales16; - scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8))); - scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8))); - return make_wider(scales16); - } - - template - inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) { - auto m1 = vdupq_n_u8(1); - auto shuffle = vdupq_n_u8(8*j); - bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - auto q8b_1 = q8.load_quants(iy, 
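// DequantizerQ6K above rebuilds 6-bit weights from a 4-bit low plane (ql) and a 2-bit
// high plane (qh), and relies on the -32*d term fed to the bsums path to re-centre them.
// Scalar form of reconstructing a single weight, assuming the usual ggml q6_K layout:
#include <cstdint>

static inline int8_t q6_value_sketch(uint8_t ql_nibble, uint8_t qh_2bits) {
    // 4 low bits | 2 high bits placed at bits 4..5, then shift the range to [-32, 31]
    return (int8_t)((ql_nibble & 0x0F) | ((qh_2bits & 0x03) << 4)) - 32;
}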
i, 4*j+0); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]), - vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]); - - auto q8b_2 = q8.load_quants(iy, i, 4*j+1); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]), - vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]); - - auto q8b_3 = q8.load_quants(iy, i, 4*j+2); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]), - vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]); - - auto q8b_4 = q8.load_quants(iy, i, 4*j+3); - sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]), - vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]); - } - } - - inline void prepare(int i, int j) { - bits.prepare(x[i].qs+32*j); - } - - uint32_t aux32[4]; - - uint8x16_t scales8; - - Q2bits bits; - - float d; -}; - -// ============================= i-quants - -struct DequantizerIQ4XS final : public BaseDequantizer { - - static int8x16_t load_values() { - static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - return vld1q_s8(iq4nl_values); - } - - DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {} - - constexpr static int num_blocks() { return 8; } - constexpr static bool should_scale_quants() { return false; } - - inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); } - - template - inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { - (void)q8; - (void)acc; - d = GGML_FP16_TO_FP32(x[i].d); - const uint16_t scales_h = x[i].scales_h; - const uint16_t * scales_l = (const uint16_t *)x[i].scales_l; - aux32[0] = scales_l[0] | (scales_l[1] << 16); - aux32[1] = aux32[0] >> 4; - // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7 - uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf)); - uint16_t * aux16 = (uint16_t *)aux32; - aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2; - // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7 - uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30)); - int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32)); - // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7 - scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff)); - int16x8_t scales16 = vmovl_s8(scales8); - int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))}; - return scales; - } - inline void prepare(int i, int j) { - bits.prepare16(x[i].qs+64*j); - for (int k = 0; k < 4; ++k) { - bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k])); - bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k])); - } - } - - Q4bits bits; - const int8x16_t values; - uint32_t aux32[2]; - - constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602}; - - float d; -}; - -struct SimpleBits { - uint8x16x4_t b1; - uint8x16x4_t b2; -}; - -IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) { - int32x4x2_t scales; - auto one = vdupq_n_u32(1); - scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1)); - scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1)); - return scales; -} - -inline void apply_signs_2(uint8x16_t * b, const 
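// DequantizerIQ4XS above is a codebook quant: each 4-bit index selects one of 16
// non-uniformly spaced values (the iq4nl table in load_values), and vqtbl1q_s8 performs
// sixteen of those lookups per instruction. A scalar sketch of the lookup; the exact
// ordering of the 32 outputs within a block follows the ggml reference dequantizer and
// may differ from the register layout the NEON code builds:
#include <cstdint>

static const int8_t kIQ4NL_sketch[16] = {-127, -104, -83, -65, -49, -35, -22, -10,
                                            1,   13,  25,  38,  53,  69,  89, 113};

static void iq4_lookup_sketch(const uint8_t packed[16], int8_t out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j]      = kIQ4NL_sketch[packed[j] & 0x0F];   // low nibble -> codebook value
        out[j + 16] = kIQ4NL_sketch[packed[j] >> 4];     // high nibble -> codebook value
    }
}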
uint64_t * signs, uint32_t sidx) { - auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127)))); - auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127)))); - b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1)); - b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2)); -} - -IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) { - return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1)); -} - -struct DequantizerIQ2XXS final : public BaseDequantizer { - DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); } - - inline int32x4_t unpack(int i, int j, uint8x16_t * q) const { - auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j)); - prepare_all(data, q); - return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1])); - } - -private: - - static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) { - const uint8_t * idx = (const uint8_t *)bits; - b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]}); - b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]}); - apply_signs_2(b, signs, bits[1]); - } - - inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) { - const uint32_t * q2 = (const uint32_t *)data.val; - prepare2(quants+0, q2+0, keven_signs); - prepare2(quants+2, q2+2, keven_signs); - prepare2(quants+4, q2+4, keven_signs); - prepare2(quants+6, q2+6, keven_signs); - } -}; - -inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) { - auto aux = vld1_u8(sc); - auto scales_l = vand_u8(aux, vdup_n_u8(0xf)); - auto scales_h = vshr_n_u8(aux, 4); - auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); - - auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1))); - int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) }; - return make_wider(scales16); -} - -struct DequantizerIQ2XS final : public BaseDequantizer { - DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 16; } - constexpr static bool should_scale_quants() { return false; } - - SimpleBits bits; - float d; - - inline int32x4x4_t new_block(int i) { - d = 0.125f * GGML_FP16_TO_FP32(x[i].d); - prepare_internal(i, 0); - return prepare_4bit_scales16(x[i].scales); - } - - inline void prepare(int i, int j) { - if (j == 1) prepare_internal(i, 1); - } - -private: - - static void make2(const uint16_t * qs, uint8x16_t * b) { - auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511)))); - auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511)))); - auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9)))); - auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9)))); - b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1)); - b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2)); - } - - inline static void make4(const uint16_t * qs, uint8x16_t * 
b) { - make2(qs + 0, b + 0); - make2(qs + 4, b + 2); - } - - IQK_ALWAYS_INLINE void prepare_internal(int i, int j) { - make4(x[i].qs + 16*j + 0, bits.b1.val); - make4(x[i].qs + 16*j + 8, bits.b2.val); - } - -}; - -// So, I hate to include this table, but with the GCC 12.3 compiler -// bundled in the Cosmopolitan tools, loading the unpacked sign bytes -// from this table using the packed 8 sign bits as index is faster than -// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to -// expand the bits to bytes. -static const uint64_t kall_signs[256] = { - 0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff, - 0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff, - 0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff, - 0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff, - 0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff, - 0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff, - 0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff, - 0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff, - 0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff, - 0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff, - 0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff, - 0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff, - 0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff, - 0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff, - 0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff, - 0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff, - 0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff, - 0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff, - 0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff, - 0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff, - 0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff, - 0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff, - 0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff, - 0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff, - 0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff, - 0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff, - 0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff, - 0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff, - 0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff, - 0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff, - 0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff, - 0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff, - 0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff, - 0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff, - 0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff, - 0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff, - 0xff0101ff01010101, 
0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff, - 0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff, - 0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff, - 0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff, - 0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff, - 0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff, - 0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff, - 0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff, - 0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff, - 0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff, - 0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff, - 0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff, - 0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff, - 0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff, - 0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff, - 0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff, - 0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff, - 0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff, - 0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff, - 0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff, - 0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff, - 0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff, - 0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff, - 0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff, - 0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff, - 0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff, - 0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff, - 0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff, -}; - -struct SignHelper { - - IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const { - auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]}); - // Normally we would expect this to be faster, but it isn't. - // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1])); - // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1)); - b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s)); - } - - // We would need these two if we weren't loading from the unpacked sign table. 
- //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201)); - //const uint8x16_t m1 = vdupq_n_u8(1); -}; - -struct DequantizerIQ2S final : public BaseDequantizer { - DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 16; } - constexpr static bool should_scale_quants() { return false; } - - SimpleBits bits; - float d; - - inline int32x4x4_t new_block(int i) { - d = 0.125f * GGML_FP16_TO_FP32(x[i].d); - prepare_internal(i, 0, bits); - return prepare_4bit_scales16(x[i].scales); - } - - inline void prepare(int i, int j) { - if (j == 1) prepare_internal(i, 1, bits); - } - -private: - - static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) { - uint32_t aux32[2]; - const uint16_t * aux16 = (const uint16_t *)aux32; - for (int k = 0; k < 2; ++k) { - aux32[1] = (qh[k] << 4) | (qh[k] << 18); - aux32[0] = (aux32[1] << 4) & 0x03000300; - aux32[1] &= 0x03000300; - b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))), - vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1])))); - b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))), - vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3])))); - sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2; - sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2; - } - } - - void prepare_internal(int i, int j, SimpleBits& sb) { - - const auto * qs = x[i].qs + 16*j; - const auto * qh = x[i].qh + 4*j; - const auto * sign_bits = qs + QK_K/8; - - make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val); - make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val); - } - - SignHelper sh; -}; - -struct DequantizerIQ3XXS final : public BaseDequantizer { - DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); } - - inline int32x4_t unpack(int i, int j, uint8x16_t * q) const { - auto q3data = vld1q_u8_x2(x[i].qs + 32*j); - auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j)); - prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q); - return prepare_scales_8(gas); - } - -private: - - inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) { - b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]}); - b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]}); - apply_signs_2(b, keven_signs, sidx); - } - inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) { - make2(q3+ 0, signs[0], quants + 0); - make2(q3+ 8, signs[1], quants + 2); - make2(q3+16, signs[2], quants + 4); - make2(q3+24, signs[3], quants + 6); - } -}; - -struct DequantizerIQ3S final : public BaseDequantizer { - DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} - - constexpr static int num_blocks() { return 8; } - constexpr static bool should_scale_quants() { return false; } - - SimpleBits bits; - float d; - - inline int32x4x2_t new_block(int i) { - d = GGML_FP16_TO_FP32(x[i].d); - uint32_t scales32[2]; - auto qs = vld1q_u8_x2(x[i].qs); - auto signs = vld1q_u8(x[i].signs); - - prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs); - - std::memcpy(scales32, x[i].scales, 4); - 
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; - scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; - auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7 - scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400))); - auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8)); - int32x4x2_t scales; - scales.val[0] = vmovl_s16(vget_low_s16(scales16)); - scales.val[1] = vmovl_s16(vget_high_s16(scales16)); - return scales; - } - - inline void prepare(int i, int j) { - if (j == 1) { - auto qs = vld1q_u8_x2(x[i].qs + 32); - auto signs = vld1q_u8(x[i].signs + 16); - prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs); - } - } - -private: - - static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh, - const int16x8_t& hshift, uint8x16_t * b) { - auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256))); - const uint16_t * idx = (const uint16_t *)&vindex; - b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]}); - sh.apply_signs_1x(b+0, sign_bits+0); - b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]}); - sh.apply_signs_1x(b+1, sign_bits+2); - } - static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, - const int16x8_t& hshift, uint8x16_t * b) { - auto idx_l = vld1q_u8(qs); - make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0); - make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2); - } - - static int16x8_t load_shift() { - static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; - return vld1q_s16(k_shift); - } - - inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) { - auto signs = vld1q_u8(sign_bits); - auto s = (const uint8_t *)&signs; - make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val); - make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val); - } - - SignHelper sh; - const int16x8_t hshift = load_shift(); - -}; - -template -IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - assert(n % QK_K == 0); - const int nb = n / QK_K; - - Q8 q8(info); - Dequantizer deq(vx, bx, nrc_y); - uint8x16_t qx[8]; - int32x4_t sumi[nrc_y]; - float32x4_t acc[nrc_y]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); - - for (int i = 0; i < nb; ++i) { - float d = deq.new_block(i); - auto scales = deq.unpack(i, 0, qx); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - sumi[iy] = vdupq_n_s32(0); - compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]); - } - scales = deq.unpack(i, 1, qx); -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]); - acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy])); - } - } -#pragma GCC unroll 8 - for (int iy = 0; iy < nrc_y; ++iy) { - info.store(ix, iy, vaddvq_f32(acc[iy])); - } - } -} - -// =========================================== Legacy quants - -template -inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) { - for (int k = 0; k < 4; ++k) aux[k] = x[k].d; - return vld1_f16((const float16_t *)aux); -} - -template -inline float16x8_t 
load_scales_q1(const Block * x, ggml_half * aux) { - if constexpr (std::is_same_v) { - for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; } - } else { - for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; } - } - return vld1q_f16((const float16_t *)aux); -} - -struct Q4LegacyBits { - template - inline void prepare(const Block * x) { - for (int i = 0; i < 4; ++i) { - auto q4bits = vld1q_u8(x[i].qs); - b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b)); - b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4)); - } - } - inline void prepare1(const uint8_t * qs, int8x16_t * q) const { - auto q4bits = vld1q_u8(qs); - q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b)); - q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4)); - } - inline void prepare1(const uint8_t * qs) { - prepare1(qs, b); - } - const uint8x16_t m4b = vdupq_n_u8(0xf); - int8x16_t b[8]; -}; - -// One would think this commented out version would do better than the one below -// because it offers more opportunities to execute instructions in parallel. -// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers -// cannot it just do the sequential version below on its own? -//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) { -// const auto q8b_1 = vld1q_s8_x2(qs + 0); -// auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]); -// const auto q8b_2 = vld1q_s8_x2(qs + 32); -// auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]); -// auto p1234 = vpaddq_s32(p12, p34); -// const auto q8b_3 = vld1q_s8_x2(qs + 64); -// auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]); -// const auto q8b_4 = vld1q_s8_x2(qs + 96); -// auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]); -// return vpaddq_s32(p1234, vpaddq_s32(p56, p78)); -//} - -inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) { - auto q8b = vld1q_s8_x2(qs + 0); - auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]); - q8b = vld1q_s8_x2(qs + 32); - auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]); - auto p1234 = vpaddq_s32(p12, p34); - q8b = vld1q_s8_x2(qs + 64); - auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]); - q8b = vld1q_s8_x2(qs + 96); - auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]); - return vpaddq_s32(p1234, vpaddq_s32(p56, p78)); -} - -template struct Q80 { - - constexpr static int nrc_y = nrc; - - Q80(const DataInfo& info) { - for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy); - } - - inline const int8_t * quant_data(int iy, int i) const { - const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; - return y4->qs; - } - - inline float16x4_t load_scales(int iy, int i) const { - const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; - return vld1_f16((const float16_t *)y4->d); - } - - template - inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const { - auto qx_scales = deq.new_block(i); - for (int iy = 0; iy < nrc; ++iy) { - auto q8_scales = load_scales(iy, i); - sc16[iy] = vmul_f16(qx_scales, q8_scales); - } - } - - template - inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const { - deq.prepare1(i); - 
float d = GGML_FP16_TO_FP32(deq.x[i].d); - for (int iy = 0; iy < nrc; ++iy) { - auto q8b = vld1q_s8_x2(y[iy][i].qs); - auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]); - acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p)); - } - } - - const block_q8_0 * y[nrc_y]; -}; - -template struct Q81 { - - constexpr static int nrc_y = nrc; - - Q81(const DataInfo& info) { - for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy); - } - - inline const int8_t * quant_data(int iy, int i) const { - const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; - return y4->qs; - } - - inline float16x8_t load_scales(int iy, int i) const { - const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; - return vld1q_f16((const float16_t *)y4->d); - } - - template - inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const { - auto qx_scales = deq.new_block(i); - for (int iy = 0; iy < nrc; ++iy) { - auto q8_scales = load_scales(iy, i); - auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales)); - acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m)); - sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales)); - } - } - - template - inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const { - deq.prepare1(i); - float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m); - for (int iy = 0; iy < nrc; ++iy) { - auto q8b = vld1q_s8_x2(y[iy][i].qs); - auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]); - acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p)); - acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s))); - } - } - - const block_q8_1 * y[nrc_y]; -}; - -template -struct BaseLegacyDequantizer { - - BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {} - - inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); } - - Q4LegacyBits bits; - - const void * vx; - const block_q * x; - size_t bx; -}; - -struct DequantizerQ40 final : public BaseLegacyDequantizer { - - DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} - - inline void prepare1(int i, int8x16_t * q) const { - bits.prepare1(x[i].qs, q); - q[0] = vaddq_s8(q[0], m8); - q[1] = vaddq_s8(q[1], m8); - } - inline void prepare1(int i) { - prepare1(i, bits.b); - } - - inline float16x4_t new_block(int i) { - ggml_half aux[4]; - for (int k = 0; k < 4; ++k) { - aux[k] = x[4*i+k].d; - prepare1(4*i+k, bits.b + 2*k); - } - return vld1_f16((const float16_t *)aux); - } - - const int8x16_t m8 = vdupq_n_s8(-8); - //ggml_half aux[4]; -}; - -struct DequantizerQ41 : public BaseLegacyDequantizer { - - DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} - - inline void prepare1(int i) { - bits.prepare1(x[i].qs); - } - - inline float16x8_t new_block(int i) { - uint32_t aux32[4]; - const uint32_t * s32 = (const uint32_t *)&x[4*i].d; - for (int k = 0; k < 4; ++k) { - aux32[k] = *s32; s32 += sizeof(block_q4_1)/4; - bits.prepare1(x[4*i+k].qs, bits.b + 2*k); - } - return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle))); - } - // Leaving this commented out attempt to be reminded that I already tried this. - // It has basically the same performance as the version above. 
- //inline float16x8_t new_block(int i) { - // uint32x4_t scales = {}; - // const block_q4_1 * xi = x + 4*i; - // const uint32_t * s32 = (const uint32_t *)&xi->d; - // scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4; - // bits.prepare1(xi[0].qs, bits.b + 0); - // scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4; - // bits.prepare1(xi[1].qs, bits.b + 2); - // scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4; - // bits.prepare1(xi[2].qs, bits.b + 4); - // scales = vsetq_lane_u32(*s32, scales, 3); - // bits.prepare1(xi[3].qs, bits.b + 6); - // return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle))); - //} - - const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302}; -}; - -struct HighBit5Legacy { - inline uint8x16_t to_bytes(const uint8_t * qh) const { - uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle); - return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask)); - } - inline uint8x16_t to_negated_bytes(const uint8_t * qh) const { - uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle); - return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0)); - } - const uint64x2_t mask = vdupq_n_u64(0x8040201008040201); - const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1)); -}; - -struct DequantizerQ50 final : public BaseLegacyDequantizer { - - DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} - - inline void prepare1(int i, int8x16_t * q) const { - bits.prepare1(x[i].qs, q); - auto qh = x[i].qh; - q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0)))); - q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2)))); - } - inline void prepare1(int i) { - prepare1(i, bits.b); - } - - inline float16x4_t new_block(int i) { - ggml_half aux[4]; - for (int k = 0; k < 4; ++k) { - aux[k] = x[4*i+k].d; - prepare1(4*i+k, bits.b + 2*k); - } - return vld1_f16((const float16_t *)aux); - } - - HighBit5Legacy hbits; - - const uint8x16_t mh = vdupq_n_u8(0xf0); - -}; - -struct DequantizerQ80 final : public BaseLegacyDequantizer { - - DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} - - inline void prepare1(int i) { - bits.b[0] = vld1q_s8(x[i].qs); - bits.b[1] = vld1q_s8(x[i].qs+16); - } - - inline float16x4_t new_block(int i) { - ggml_half aux[4]; - for (int k = 0; k < 4; ++k) { - aux[k] = x[4*i+k].d; - bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs); - bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16); - } - return vld1_f16((const float16_t *)aux); - } - -}; - -struct DequantizerQ51 final : public BaseLegacyDequantizer { - - DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} - - inline void prepare1(int i, int8x16_t * q) const { - bits.prepare1(x[i].qs, q); - auto qh = x[i].qh; - q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0)))); - q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2)))); - } - inline void prepare1(int i) { - bits.prepare1(x[i].qs, bits.b); - } - - inline float16x8_t new_block(int i) { - uint32_t aux32[4]; - const uint32_t * s32 = (const uint32_t *)&x[4*i].d; - for (int k = 0; k < 4; ++k) { - aux32[k] = *s32; s32 += sizeof(block_q5_1)/4; - prepare1(4*i+k, bits.b + 2*k); - } - return 
vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle))); - } - - HighBit5Legacy hbits; - - const uint8x16_t mh = vdupq_n_u8(0x10); - const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302}; - -}; - -template -inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) { - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i)); - auto scale = vcvt_f32_f16(sc16[iy]); - acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall)); - } -} - -template -inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) { - const int nb = n / QK4_1; - - float16x4_t sc16[Q8::nrc_y]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq.new_row(ix); - - float32x4_t acc[Q8::nrc_y]; - for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); - - for (int i = 0; i < nb/4; ++i) { - q8.process_scales(i, deq, sc16, acc); - sum_4(i, deq, q8, sc16, acc); - } - for (int i = 4*(nb/4); i < nb; ++i) { - q8.process_1_block(i, deq, acc); - } - - for (int iy = 0; iy < Q8::nrc_y; ++iy) { - info.store(ix, iy, vaddvq_f32(acc[iy])); - } - } -} - -template -inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) { - const int nb = n / QK4_1; - - float16x4_t sc16[2]; - - for (int ix = 0; ix < nrc_x; ++ix) { - - deq1.new_row(ix); - deq2.new_row(ix); - - float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) }; - - for (int i = 0; i < nb/8; ++i) { - q8.process_scales(2*i+0, deq1, sc16+0, acc+0); - q8.process_scales(2*i+1, deq2, sc16+1, acc+1); - sum_4(2*i+0, deq1, q8, sc16+0, acc+0); - sum_4(2*i+1, deq2, q8, sc16+1, acc+1); - } - for (int i = 2*(nb/8); i < nb/4; ++i) { - q8.process_scales(i, deq1, sc16, acc); - sum_4(i, deq1, q8, sc16, acc); - } - for (int i = 4*(nb/4); i < nb; ++i) { - q8.process_1_block(i, deq1, acc); - } - - info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1]))); - } -} - -template -static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - Q81 q8(info); - if constexpr (nrc_y == 1) { - Dequantizer deq1(vx, bx), deq2(vx, bx); - mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); - } else { - Dequantizer deq(vx, bx); - mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x); - } -} - -template -static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - Q80 q8(info); - if constexpr (nrc_y == 1) { - Dequantizer deq1(vx, bx), deq2(vx, bx); - mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); - } else { - Dequantizer deq(vx, bx); - mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x); - } -} - -template -static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - Dequantizer deq1(vx, bx), deq2(vx, bx); - Q81<1> q8(info); - mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); -} - -template -static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { - Dequantizer deq1(vx, bx), deq2(vx, bx); - Q80<1> q8(info); - mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x); -} - -template void MulMat::set_functions(MulMat& m) { - if constexpr (std::is_same_v || std::is_same_v || - std::is_same_v) { - m.funcs[0] = mul_mat_qX_0_q8_0; - m.funcs[1] = mul_mat_qX_0_q8_0; - m.funcs[2] = mul_mat_qX_0_q8_0; - m.funcs[3] = mul_mat_qX_0_q8_0; - m.funcs[4] = mul_mat_qX_0_q8_0; - m.funcs[5] = 
mul_mat_qX_0_q8_0; - m.funcs[6] = mul_mat_qX_0_q8_0; - m.funcs[7] = mul_mat_qX_0_q8_0; - } - else if constexpr (std::is_same_v || std::is_same_v) { - m.funcs[0] = mul_mat_qX_1_q8_1; - m.funcs[1] = mul_mat_qX_1_q8_1; - m.funcs[2] = mul_mat_qX_1_q8_1; - m.funcs[3] = mul_mat_qX_1_q8_1; - m.funcs[4] = mul_mat_qX_1_q8_1; - m.funcs[5] = mul_mat_qX_1_q8_1; - m.funcs[6] = mul_mat_qX_1_q8_1; - m.funcs[7] = mul_mat_qX_1_q8_1; - } - else if constexpr (std::is_same_v || std::is_same_v) { - m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>; - m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>; - m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>; - m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>; - m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>; - m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>; - m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>; - m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>; - } - else if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>; - m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>; - m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>; - m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>; - m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>; - m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>; - m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>; - m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>; - } - else { - m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>; - m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>; - m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>; - m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>; - m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>; - m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>; - m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>; - m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>; - } -} - -bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) { - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); - - (void)Ny; - // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications. 
- //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S || - // typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false; - - switch (typeA) { - case GGML_TYPE_Q2_K: - MulMat::set_functions(m); - break; - case GGML_TYPE_Q3_K: - MulMat::set_functions(m); - break; - case GGML_TYPE_Q4_K: - MulMat::set_functions(m); - break; - case GGML_TYPE_Q5_K: - MulMat::set_functions(m); - break; - case GGML_TYPE_Q6_K: - MulMat::set_functions(m); - break; - case GGML_TYPE_IQ4_XS: - MulMat::set_functions(m); - break; - case GGML_TYPE_IQ3_S: - MulMat::set_functions(m); - break; - case GGML_TYPE_IQ3_XXS: - MulMat::set_functions(m); - break; - case GGML_TYPE_IQ2_S: - MulMat::set_functions(m); - break; - case GGML_TYPE_IQ2_XS: - MulMat::set_functions(m); - break; - case GGML_TYPE_IQ2_XXS: - MulMat::set_functions(m); - break; - case GGML_TYPE_Q4_0: - MulMat::set_functions(m); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); - break; - case GGML_TYPE_Q4_1: - MulMat::set_functions(m); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); - break; - case GGML_TYPE_Q5_0: - MulMat::set_functions(m); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); - break; - case GGML_TYPE_Q5_1: - MulMat::set_functions(m); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); - break; - case GGML_TYPE_Q8_0: - MulMat::set_functions(m); - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); - break; - default: - return false; - } - return true; -} - -} - -#endif // __x86_64__ or __aarch64__ + // use the ARM version + #include "iqk_mul_mat_x86.inc" +#endif \ No newline at end of file diff --git a/third_party/llamafile/iqk_mul_mat_arm.inc b/third_party/llamafile/iqk_mul_mat_arm.inc new file mode 100644 index 0000000..1b02295 --- /dev/null +++ b/third_party/llamafile/iqk_mul_mat_arm.inc @@ -0,0 +1,5866 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc +// Copyright 2024 Iwan Kawrakow. +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp fenc=utf-8 :vi +// +// Copyright 2024 Iwan Kawrakow +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64) + +#include "llama.cpp/ggml-impl.h" +#include "llama.cpp/ggml-quants.h" +#include "sgemm.h" + +// For i-quants, I had to explicitly specify which +// functions to inline / not inline (at least for some +// of the functions), else performance would be significantly +// lower. This is worrisome as things can change with, +// e.g., a different compiler version or running on a different
// CPU.
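The hunk above leaves the original iqk_mul_mat.inc as little more than an architecture dispatcher: the actual kernels now live in the new iqk_mul_mat_arm.inc (this file) and iqk_mul_mat_x86.inc. A minimal sketch of that dispatch pattern is given below; the exact preprocessor guard is not visible in this hunk, so keying the selection on __aarch64__ is an assumption rather than verbatim patch content:

// Sketch of the dispatcher left behind in iqk_mul_mat.inc (assumed guard, not verbatim patch content).
#if defined(__aarch64__)
    // ARM NEON kernels
    #include "iqk_mul_mat_arm.inc"
#else
    // x86 AVX2/AVX512 kernels
    #include "iqk_mul_mat_x86.inc"
#endif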
+#ifdef _MSC_VER +#define IQK_NOINLINE __declspec(noinline) +#define IQK_ALWAYS_INLINE inline +#else +#define IQK_NOINLINE __attribute__((__noinline__)) +#define IQK_ALWAYS_INLINE __attribute__((always_inline)) +#endif + +#define GGML_COMMON_IMPL_C +#include "llama.cpp/ggml-common.h" + +// clang-format off + +// This matrix - vector and matrix - matrix multiplication implementation +// for legacy quants, k-quants and i-quants makes prompt processing 150-200% +// (legacy and k-quants) or 250-400% (i-quants) faster. +// compared to mainline llama.cpp (and llamafile). +// It provides implementations for ARM_NEON (all quants) and AVX2 +// (all quants except sub-4 bit i-quants). +// +// Main idea is that unpacking the quants and the block scales to +// be ready for dot products with the corresponding Q8_Y quants +// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type). +// Hence, if we are performing a QX x Q8_Y matrix matrix +// multiplication (as needed for prompt processing), we can get +// a significant speedup by reusing the unpacked QX quants and scales +// for multiplication with several Q8_K columns. We also achieve fewer +// loads from memory, which is the main purpose of tiling in general +// purpose matrix multiplication packages. + +#include +#include + +#endif + +constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast(98); +constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast(99); + + +namespace { +#define GEMV_Q4K +#define GEMV_Q6K +#define GEMM_Q4K_Q6K + +typedef struct { + int32_t i1; + int32_t i2; +} mmid_row_mapping; + +struct DataInfo { + float * s; + const char * cy; + size_t bs; + size_t by; + int cur_y = 0; + int ne11; + const mmid_row_mapping * row_mapping = nullptr; + size_t bs2 = 0; + + inline const char * src1_row(int iy) const { + if (!row_mapping) return cy + (cur_y + iy)*by; + int i11 = row_mapping[cur_y + iy].i1 % ne11; + int i12 = row_mapping[cur_y + iy].i2; + return cy + (i11 + i12*ne11)*by; + } + + inline void store(int ix, int iy, float result) const { + *(dst_row(iy) + ix) = result; + //dst_row(iy)[ix] = result; + } + inline float* ptr(int ix, int iy) const { + return dst_row(iy) + ix; + } + inline float * dst_row(int iy) const { + if (!row_mapping) return s + (cur_y + iy)*bs; + int i12 = row_mapping[cur_y + iy].i2; + int i1 = row_mapping[cur_y + iy].i1; + int i2 = i12; + return s + i1*bs + i2*bs2; + } +}; + +/* +moonll +change param for set_mul_mat +add func16 +*/ + +typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x); +typedef void (*mul_mat_t_v2)(int m, int n, int k, const void *vx, size_t bx, const DataInfo& info); + +struct MulMat { + std::array funcs = {}; + mul_mat_t func16 = nullptr; + mul_mat_t_v2 funcs_v2; + //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { + IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { + constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small) + + if (func16 && nrc_y >= 16) { + int n_step = (nrc_y - info.cur_y)/16; + for (int ix = 0; ix < nrc_x; ix += k_x_step) { + auto this_info = info; + this_info.s += ix; + int this_nrc_x = ix + k_x_step <= nrc_x ? 
k_x_step : nrc_x - ix; + for (int iy = 0; iy < n_step; ++iy) { + func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x); + this_info.cur_y += 16; + } + } + info.cur_y += 16 * n_step; + if (info.cur_y == nrc_y) return; + } + + int n_step = (nrc_y - info.cur_y)/funcs.size(); + if (n_step > 0) { + for (int ix = 0; ix < nrc_x; ix += k_x_step) { + auto this_info = info; + this_info.s += ix; + int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix; + for (int iy = 0; iy < n_step; ++iy) { + funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x); + this_info.cur_y += funcs.size(); + } + } + info.cur_y += funcs.size() * n_step; + } + int n_left = nrc_y - info.cur_y; + if (n_left > 0) { + funcs[n_left-1](n, vx, bx, info, nrc_x); + } + } +#if defined __x86_64__ || defined(_M_X64) + static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny); +#else + IQK_NOINLINE void mul_mat_NxM_v2(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) { + funcs_v2(nrc_x, nrc_y, n, vx, bx, info); + return; + } + static IQK_NOINLINE bool set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny); +#endif +private: + template static IQK_NOINLINE void set_functions(MulMat& m); +}; + +inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) { + const uint16_t * scales = (const uint16_t *)scales8; + const uint32_t a0 = scales[0] | (scales[1] << 16); + const uint32_t a1 = scales[2] | (scales[3] << 16); + const uint32_t a2 = scales[4] | (scales[5] << 16); + aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030); + aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030); + aux32[2] = a1 & 0x3f3f3f3f; + aux32[0] = a0 & 0x3f3f3f3f; +} + +/* +moonll +decoding tables +*/ +#ifdef __AVX2__ +static const uint64_t iq1s_grid_us[2048] = { + 0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200, + 0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000, + 0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101, + 0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101, + 0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202, + 0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200, + 0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001, + 0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202, + 0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201, + 0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001, + 0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101, + 0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101, + 0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202, + 0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200, + 0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201, + 0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002, + 0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101, + 0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200, + 0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102, + 0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101, + 0x0000010001010202, 0x0000010001020100, 
0x0000010001020101, 0x0000010002010001, + 0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100, + 0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200, + 0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101, + 0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100, + 0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000, + 0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202, + 0x0000010102000001, 0x0000010102010001, 0x0000010102010101, 0x0000010102010200, + 0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101, + 0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201, + 0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002, + 0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001, + 0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001, + 0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002, + 0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000, + 0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101, + 0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000, + 0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101, + 0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202, + 0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201, + 0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000, + 0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100, + 0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102, + 0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002, + 0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000, + 0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101, + 0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101, + 0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200, + 0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002, + 0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001, + 0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101, + 0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101, + 0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101, + 0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102, + 0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100, + 0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002, + 0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100, + 0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000, + 0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101, + 0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101, + 0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001, + 0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102, + 0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201, + 0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 
0x0001000201010202, + 0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001, + 0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001, + 0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101, + 0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102, + 0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200, + 0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, + 0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 0x0001010001020101, + 0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000, + 0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201, + 0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101, + 0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202, + 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102, + 0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101, + 0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100, + 0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002, + 0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201, + 0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101, + 0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002, + 0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202, + 0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101, + 0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000, + 0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100, + 0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102, + 0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102, + 0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101, + 0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101, + 0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001, + 0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201, + 0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002, + 0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001, + 0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100, + 0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101, + 0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001, + 0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101, + 0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000, + 0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001, + 0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101, + 0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101, + 0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000, + 0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001, + 0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001, + 0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102, + 0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102, + 
0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101, + 0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201, + 0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202, + 0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202, + 0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101, + 0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001, + 0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000, + 0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101, + 0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200, + 0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100, + 0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100, + 0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202, + 0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102, + 0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201, + 0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202, + 0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002, + 0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001, + 0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001, + 0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101, + 0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202, + 0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201, + 0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102, + 0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200, + 0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001, + 0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101, + 0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201, + 0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001, + 0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002, + 0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000, + 0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202, + 0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201, + 0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201, + 0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101, + 0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100, + 0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000, + 0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101, + 0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202, + 0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101, + 0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202, + 0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202, + 0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201, + 0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002, + 0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102, + 0x0002020101020201, 
0x0002020102000101, 0x0002020102010000, 0x0002020102010102, + 0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000, + 0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000, + 0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101, + 0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101, + 0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202, + 0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200, + 0x0002020202020202, 0x0100000000000101, 0x0100000000010001, 0x0100000000010102, + 0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101, + 0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100, + 0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001, + 0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100, + 0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101, + 0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001, + 0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200, + 0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, + 0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101, + 0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100, + 0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101, + 0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101, + 0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101, + 0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202, + 0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100, + 0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201, + 0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202, + 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102, + 0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200, + 0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201, + 0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000, + 0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002, + 0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100, + 0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000, + 0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100, + 0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000, + 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102, + 0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100, + 0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002, + 0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001, + 0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201, + 0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202, + 0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100, + 0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001, + 0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002, + 0x0100010201000101, 0x0100010201000200, 
0x0100010201010000, 0x0100010201010001, + 0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201, + 0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001, + 0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101, + 0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101, + 0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101, + 0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101, + 0x0100020001010200, 0x0100020001020001, 0x0100020001020100, 0x0100020001020102, + 0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100, + 0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001, + 0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000, + 0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001, + 0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101, + 0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100, + 0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000, + 0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202, + 0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101, + 0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100, + 0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100, + 0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200, + 0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100, + 0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, + 0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101, + 0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201, + 0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001, + 0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201, + 0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201, + 0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001, + 0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200, + 0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100, + 0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201, + 0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200, + 0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, + 0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001, + 0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102, + 0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001, + 0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201, + 0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100, + 0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000, + 0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102, + 0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001, + 0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202, + 0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102, + 0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 
0x0101000202000101, + 0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201, + 0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101, + 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102, + 0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101, + 0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100, + 0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202, + 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101, + 0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202, + 0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101, + 0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200, + 0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101, + 0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100, + 0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002, + 0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201, + 0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100, + 0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202, + 0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102, + 0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002, + 0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200, + 0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002, + 0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200, + 0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001, + 0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200, + 0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100, + 0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000, + 0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102, + 0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100, + 0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000, + 0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102, + 0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100, + 0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000, + 0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101, + 0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001, + 0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201, + 0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002, + 0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200, + 0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100, + 0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101, + 0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202, + 0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002, + 0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201, + 0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201, + 0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001, + 
0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202, + 0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102, + 0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002, + 0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201, + 0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200, + 0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002, + 0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100, + 0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101, + 0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102, + 0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002, + 0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200, + 0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100, + 0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001, + 0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100, + 0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201, + 0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101, + 0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102, + 0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201, + 0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200, + 0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200, + 0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002, + 0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202, + 0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102, + 0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000, + 0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202, + 0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201, + 0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001, + 0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002, + 0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102, + 0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001, + 0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101, + 0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202, + 0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102, + 0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201, + 0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101, + 0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101, + 0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001, + 0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202, + 0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000, + 0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202, + 0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102, + 0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002, + 0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201, + 0x0102010002010000, 
0x0102010002010002, 0x0102010002010100, 0x0102010002010101, + 0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001, + 0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200, + 0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102, + 0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102, + 0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100, + 0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001, + 0x0102010101020100, 0x0102010101020101, 0x0102010101020102, 0x0102010101020201, + 0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001, + 0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202, + 0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200, + 0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000, + 0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000, + 0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001, + 0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200, + 0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200, + 0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202, + 0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201, + 0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202, + 0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001, + 0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001, + 0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200, + 0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000, + 0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102, + 0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101, + 0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100, + 0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000, + 0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100, + 0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100, + 0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102, + 0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201, + 0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202, + 0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102, + 0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102, + 0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202, + 0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202, + 0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100, + 0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000, + 0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101, + 0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202, + 0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102, + 0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100, + 0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101, + 0x0200000101010102, 0x0200000101010200, 
0x0200000101010201, 0x0200000101020100, + 0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201, + 0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101, + 0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202, + 0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200, + 0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201, + 0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200, + 0x0200000202000202, 0x0200000202010101, 0x0200000202020000, 0x0200000202020002, + 0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201, + 0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101, + 0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201, + 0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201, + 0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102, + 0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101, + 0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101, + 0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101, + 0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001, + 0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000, + 0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102, + 0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101, + 0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202, + 0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202, + 0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101, + 0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000, + 0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101, + 0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202, + 0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100, + 0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000, + 0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101, + 0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202, + 0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100, + 0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100, + 0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002, + 0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100, + 0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101, + 0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202, + 0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200, + 0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100, + 0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200, + 0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002, + 0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001, + 0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101, + 0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101, + 0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 
0x0201000001020202, + 0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102, + 0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100, + 0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101, + 0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100, + 0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101, + 0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101, + 0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 0x0201000101020101, + 0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101, + 0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102, + 0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100, + 0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102, + 0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101, + 0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101, + 0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001, + 0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101, + 0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202, + 0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102, + 0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001, + 0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102, + 0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200, + 0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101, + 0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001, + 0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201, + 0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202, + 0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102, + 0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002, + 0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200, + 0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100, + 0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001, + 0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002, + 0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201, + 0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101, + 0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100, + 0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000, + 0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200, + 0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101, + 0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200, + 0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202, + 0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100, + 0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102, + 0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102, + 0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102, + 0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101, + 
0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101, + 0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000, + 0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202, + 0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102, + 0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200, + 0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101, + 0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101, + 0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100, + 0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202, + 0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101, + 0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201, + 0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001, + 0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101, + 0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200, + 0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002, + 0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001, + 0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000, + 0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101, + 0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202, + 0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100, + 0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102, + 0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200, + 0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101, + 0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201, + 0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000, + 0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202, + 0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201, + 0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200, + 0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002, + 0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101, + 0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100, + 0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001, + 0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201, + 0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000, + 0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102, + 0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001, + 0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201, + 0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100, + 0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002, + 0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001, + 0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101, + 0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002, + 0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000, + 0x0202010201020002, 
0x0202010202000102, 0x0202010202010000, 0x0202010202010101, + 0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100, + 0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200, + 0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200, + 0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102, + 0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200, + 0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002, + 0x0202020002020200, 0x0202020002020202, 0x0202020100000101, 0x0202020100010100, + 0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001, + 0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001, + 0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102, + 0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202, + 0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202, + 0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000, + 0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101, + 0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202, +}; +#else +static const uint32_t iq1s_grid_us[2048] = { + 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000, + 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101, + 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200, + 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212, + 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011, + 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111, + 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220, + 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022, + 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220, + 0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101, + 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110, + 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111, + 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010, + 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210, + 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221, + 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021, + 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002, + 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101, + 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101, + 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211, + 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110, + 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022, + 0x00002220, 0x00002222, 0x00012121, 0x00022020, 
0x00022022, 0x00022220, 0x00022222, 0x01002121, + 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220, + 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001, + 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101, + 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102, + 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012, + 0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010, + 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111, + 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122, + 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222, + 0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001, + 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102, + 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101, + 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000, + 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101, + 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112, + 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110, + 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211, + 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012, + 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111, + 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120, + 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122, + 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121, + 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221, + 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001, + 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101, + 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101, + 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011, + 0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111, + 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011, + 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122, + 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121, + 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222, + 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101, + 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000, + 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200, + 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 
0x01200012, 0x01200110, + 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112, + 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222, + 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021, + 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121, + 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201, + 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200, + 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101, + 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011, + 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010, + 0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211, + 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121, + 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000, + 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202, + 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202, + 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211, + 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112, + 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020, + 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121, + 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222, + 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102, + 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100, + 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110, + 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011, + 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111, + 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110, + 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121, + 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222, + 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201, + 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102, + 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201, + 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012, + 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010, + 0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010, + 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110, + 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011, + 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212, + 
0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021, + 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021, + 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021, + 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101, + 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101, + 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100, + 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010, + 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111, + 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010, + 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111, + 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120, + 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120, + 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101, + 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001, + 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201, + 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210, + 0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211, + 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111, + 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112, + 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211, + 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010, + 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021, + 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122, + 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221, + 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102, + 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100, + 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101, + 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101, + 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101, + 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012, + 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110, + 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112, + 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210, + 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210, + 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210, + 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010, + 0x12111011, 0x12111110, 
0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110, + 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122, + 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020, + 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021, + 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022, + 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120, + 0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222, + 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221, + 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001, + 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102, + 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201, + 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012, + 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111, + 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012, + 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110, + 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110, + 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121, + 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221, + 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220, + 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222, + 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000, + 0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201, + 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012, + 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011, + 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212, + 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221, + 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121, + 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202, + 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202, + 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002, + 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101, + 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210, + 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112, + 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011, + 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011, + 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210, + 0x10201022, 0x10201221, 0x10211121, 0x10221020, 
0x10221122, 0x10221220, 0x10221221, 0x11201020, + 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220, + 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222, + 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222, + 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001, + 0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010, + 0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111, + 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010, + 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110, + 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221, + 0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122, + 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202, + 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100, + 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101, + 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112, + 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111, + 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211, + 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222, + 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221, + 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022, + 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101, + 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211, + 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111, + 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111, + 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010, + 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121, + 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222, + 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000, + 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202, + 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000, + 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202, + 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110, + 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110, + 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222, + 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120, + 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022, + 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 
0x20110201, 0x20120101, + 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202, + 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110, + 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110, + 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111, + 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111, + 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120, + 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121, + 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001, + 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202, + 0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001, + 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200, + 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011, + 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212, + 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012, + 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110, + 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012, + 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111, + 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020, + 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121, + 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222, + 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102, + 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102, + 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101, + 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212, + 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210, + 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111, + 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212, + 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221, + 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121, + 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002, + 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000, + 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202, + 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112, + 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111, + 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020, + 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221, + 
0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022, + 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100, + 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201, + 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112, + 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211, + 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012, + 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121, + 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020, + 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120, + 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200, + 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200, + 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110, + 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011, + 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222, + 0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020, + 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222, +}; +#endif + +#ifndef HAVE_FANCY_SIMD +const uint64_t keven_signs[128] = { + 0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff, + 0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff, + 0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff, + 0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff, + 0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff, + 0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff, + 0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff, + 0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff, + 0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff, + 0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff, + 0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff, + 0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff, + 0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff, + 0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff, + 0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff, + 0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff, + 0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff, + 0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff, + 0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff, + 0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff, + 0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff, + 0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff, + 0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff, + 0x01ff01ffffff0101, 
0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff,
+    0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff,
+    0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff,
+    0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff,
+    0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff,
+    0xffffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0xffffffff0101ffff,
+    0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
+    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
+    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
+};
+#endif
+
+}
+
+/* moonll change: iqk_mul_mat now also takes typeB and strideB */
+
+bool iqk_mul_mat(long Nx, long Ny, long ne00,
+        int typeA, const void * A, long strideA,
+        int typeB, const void * B, long strideB,
+        float * C, long stride_C, int ith, int nth) {
+
+    MulMat mm;
+#if defined __x86_64__ || defined(_M_X64)
+    if (!MulMat::set_mul_mat(typeA, typeB, (int)ne00, mm, Ny)) {
+        return false;
+    }
+#else
+    int row_size_q8;
+    if (!MulMat::set_mul_mat(typeA, (int)ne00, mm, row_size_q8, Ny)) {
+        return false;
+    }
+#endif
+
+
+    size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
+    size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
+
+
+    auto nrc_x = (Nx + nth - 1)/nth;
+    auto first_x = ith*nrc_x;
+    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
+
+    DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
+#ifdef __ARM_NEON
+#ifdef GEMM_Q4K_Q6K
+    if (Ny >= 8 && (typeA == GGML_TYPE_Q4_K || typeA == GGML_TYPE_Q6_K)) {
+        mm.mul_mat_NxM_v2(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
+    } else
+#endif
+#endif
+    {
+        mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
+    }
+
+    return true;
+}
+
+
+bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
+                     float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
+    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
+    assert(row_mapping != nullptr);
+
+    MulMat mm;
+    int row_size_q8;
+    /* moonll
+
+    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
+        return false;
+    }*/
+    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
+    int nrc_x = (Nx + nth - 1)/nth;
+    int first_x = ith*nrc_x;
+    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
+    DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
+    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
+    return true;
+}
+
+#if defined __x86_64__ || defined(_M_X64)
+
+#if defined HAVE_FANCY_SIMD
+    #undef HAVE_FANCY_SIMD
+#endif
+#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
+    #define HAVE_FANCY_SIMD
+#endif
+//#define HAVE_FANCY_SIMD
+
+namespace {
+
+inline float hsum_float_4(__m128 x) {
+    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
+    x = _mm_add_ss(x, _mm_movehdup_ps(x));
+    return _mm_cvtss_f32(x);
+}
+inline float hsum_float_8(__m256 x) {
+    return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
+}
+
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+
+template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
+
+    constexpr static int nrc_y = nrc;
+
+    Q8(const
 DataInfo& info) {
+        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
+    }
+
+#ifdef HAVE_FANCY_SIMD
+    inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
+#endif
+    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
+    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
+    inline float scale(int iy, int i) const { return y[iy][i].d; }
+
+    const block_q8 * y[nrc_y];
+};
+
+// Handles q4_K and q5_K scales/mins
+struct Scales8K {
+    template <typename Q8>
+    inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
+        make_q4_scales(data, utmp);
+        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+        const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1);
+        accum_mins(mins128, q8, i, c, accd);
+        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
+        return MM256_SET_M128I(sc128, sc128);
+    }
+#ifdef HAVE_FANCY_SIMD
+    template <typename Q8>
+    inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) {
+        auto scales = process_mins_and_scales(data, c, i, q8, accd);
+        return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1);
+    }
+#endif
+    template <typename Q8>
+    inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const {
+        const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0]));
+        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
+            const __m256i q8s = q8.load_bsums(iy, i);
+            const __m256i prod = _mm256_madd_epi16(mins, q8s);
+            accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]);
+        }
+    }
+#ifdef HAVE_FANCY_SIMD
+    const __m512i shuffles512[2] = {
+        _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302,
+                         0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100),
+        _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a,
+                         0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908)
+    };
+#endif
+    const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100),
+                                 _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)};
+
+    uint32_t utmp[4];
+};
+
+template <typename Q8>
+inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) {
+    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
+        const __m256i prod = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i));
+        accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]);
+    }
+}
+inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) {
+    const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+    const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+    scales[0] = MM256_SET_M128I(l_scales, l_scales);
+    scales[1] = MM256_SET_M128I(h_scales, h_scales);
+}
+
+struct ScaleQ3 {
+    inline __m128i make_scales(const uint16_t * s8) const {
+        const uint16_t * scales16 = (const uint16_t *)s8;
+        uint32_t aux0 = scales16[0] | (scales16[1] << 16);
+        uint32_t aux1 = scales16[2] | (scales16[3] << 16);
+        uint32_t aux2 = scales16[4] | (scales16[5] << 16);
+        __m128i scales128 = _mm_set_epi32(
+            ((aux1 >> 4) & 0x0f0f0f0f) |
((aux2 >> 2) & 0x30303030), + ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030), + (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030), + (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030)); + return _mm_add_epi8(scales128, m32); + } + const __m128i m32 = _mm_set1_epi8(-32); +}; + +struct ScaleIQ4XS { + inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) { + uint32_t tmp32 = scales_h | (scales_h << 14); + const __m128i sh = _mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4); + const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask); + return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32); + } + const __m128i hshift = _mm_set_epi32(12, 8, 4, 0); + const __m128i lshift = _mm_set_epi32(4, 0, 4, 0); + const __m128i hmask = _mm_set1_epi16(0x03); + const __m128i lmask = _mm_set1_epi8(0xf); + const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400); + const __m128i m32 = _mm_set1_epi16(-32); +}; + +struct Scales8KBase { + template + inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const { + const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0])); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i q8s = q8.load_bsums(iy, i); + const __m256i prod = _mm256_madd_epi16(mins, q8s); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + } + inline __m256i shuffle(__m128i mins) const { + return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0])); + } + const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100), + _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)}; +}; + +template +struct BaseDequantizer { + BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {} + inline void new_row(int ix) { + x = (const Block *)((const char *)vx + bx*ix); + } + + const void * vx; + size_t bx; + const Block * x; + + float d; +}; + +__m128i inline load_iq4nl_values_128() { + static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; + return _mm_loadu_si128((const __m128i *)kvalues_iq4nl); +} + +__m256i inline load_iq4nl_values_256() { + auto val128 = load_iq4nl_values_128(); + return MM256_SET_M128I(val128, val128); +} + +#ifdef HAVE_FANCY_SIMD +//====================================== Zen4 ================================================== + +struct BlockPermuter { + const __m512i permute1 = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0); + const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4); +}; + +struct Q4Bits { + inline void prepare(const uint8_t * q4) { + auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0); + auto tmp1 = _mm512_and_si512(q4bits, ml); + auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2); + values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2); + q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1); + tmp1 = _mm512_and_si512(q4bits, ml); + tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2); + values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2); + } + inline void prepare64(const uint8_t * q4) { + auto q4bits = _mm512_loadu_si512((const 
__m512i*)q4 + 0); + values[0] = _mm512_and_si512(q4bits, ml); + values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1); + values[2] = _mm512_and_si512(q4bits, ml); + values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + } + __m512i values[4]; + const __m512i ml = _mm512_set1_epi8(0xf); + BlockPermuter perm; +}; + +struct Q2Bits { + inline void prepare(const uint8_t * q2) { + + auto q2bits = _mm512_loadu_si512((const __m512i*)q2); + auto tmp = _mm512_srli_epi16(q2bits, 2); + + values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp); + values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp); + values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml); + values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml); + values[0] = _mm512_and_si512(values[0], ml); + values[2] = _mm512_and_si512(values[2], ml); + } + __m512i values[4]; + const __m512i ml = _mm512_set1_epi8(0x03); + BlockPermuter perm; +}; + +struct DequantizerQ4K final : public BaseDequantizer { + DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]); + scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]); + } + + Q4Bits bits; + Scales8K s8k; +}; + +/* +moonll DequantizerIQ4XS +*/ + +__m512i inline load_iq4nl_values_512() { + auto val256 = load_iq4nl_values_256(); + return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1); +} + +struct DequantizerIQ4XS final : public BaseDequantizer { + DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + prepare(x[i].qs); + auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h); + s8k.accum_mins(scales128, q8, i, -128.f*d, accd); + auto scales256 = MM256_SET_M128I(scales128, scales128); + auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1); + scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]); + scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]); + scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]); + scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]); + } + inline void prepare(const uint8_t * q4) { + bits.prepare64(q4); + // We now have in bits.values[0]: 0...15, 32...47, 64...79, 96...111 + // bits.values[1]: 16..31, 48...63, 80...95, 112..127 + // etc.
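+ // The two lane permutes below put the nibbles back into natural order: afterwards bits.values[0]
+ // holds quants 0...63 and bits.values[1] holds quants 64...127 (values[2]/values[3] likewise end
+ // up with 128...191 and 192...255). The byte shuffles then replace each 4-bit index with its
+ // entry from the IQ4_NL lookup table held in `values`.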
+ auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]); + bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1])); + bits.values[0] = _mm512_shuffle_epi8(values, tmp); + tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]); + bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3])); + bits.values[2] = _mm512_shuffle_epi8(values, tmp); + } + + Q4Bits bits; + Scales8KBase s8k; + ScaleIQ4XS siq4; + const __m512i values; + const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); + const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); + const __m512i shuffles[4] = { + _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1), + _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1), + }; +}; + +struct HighBit5 { + inline void apply(const uint8_t * h, Q4Bits& bits) { + auto hbits256 = _mm256_loadu_si256((const __m256i *)h); + auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1); + bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh)); + bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh)); + bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh)); + bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh)); + } + const __m512i mh = _mm512_set1_epi8(0x10); +}; + +struct HighBit3 { + inline void apply(const uint8_t * h, Q2Bits& bits) { + auto hbits256 = _mm256_loadu_si256((const __m256i *)h); + auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1); + bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh)); + bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh)); + bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh)); + bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh)); + } + const __m512i mh = _mm512_set1_epi8(0x04); +}; + +struct DequantizerQ5K final : public BaseDequantizer { + DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + hbits.apply(x[i].qh, bits); + auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]); + scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]); + } + + Q4Bits bits; + HighBit5 hbits; + Scales8K s8k; +}; + +struct Scale16 { + inline void make_scales(const __m128i& scales8, __m512i * scales) const { + auto all_scales8 = MM256_SET_M128I(scales8, scales8); + auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1); + auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2); + scales[0] = _mm512_cvtepi8_epi16(scales1); + scales[1] = _mm512_cvtepi8_epi16(scales2); + } + template + inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8, + const Q8& q8, 
__m256 * accm, __m512i * scales) const { + process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm); + make_scales(scales8, scales); + } + const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202, + 0x05050505, 0x01010101, 0x04040404, 0x00000000); + const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a, + 0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808); +}; + +struct DequantizerQ2K final : public BaseDequantizer { + DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales); + } + + Q2Bits bits; + Scale16 sc16; + const __m128i m4 = _mm_set1_epi8(0xf); + +}; + +struct DequantizerQ3K final : public BaseDequantizer { + DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + hbits.apply(x[i].hmask, bits); + auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales); + sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales); + } + + Q2Bits bits; + HighBit3 hbits; + ScaleQ3 sc3; + Scale16 sc16; + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m32 = _mm_set1_epi8(-32); +}; + +struct DequantizerQ6K final : public BaseDequantizer { + DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare64(x[i].ql); + add_high_bits(x[i].qh, bits); + auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales); + sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales); + } + + inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const { + auto hbits = _mm512_loadu_si512((const __m512i *)qh); + auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh); + auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh); + bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2)); + bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2)); + tmp1 = _mm512_and_si512(hbits, mh); + tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh); + bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2)); + bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2)); + } + + Q4Bits bits; + HighBit3 hbits; + Scale16 sc16; + + const __m512i mh = _mm512_set1_epi8(0x30); + +}; + +template +static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = 
_mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} +template +inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); +} + +template +static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + +template +static void 
mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[4]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(), + p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + +template +static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + constexpr int k_nx = 2; + + Q8<1> q8(info); + + Dequantizer deq1(vx, bx); + Dequantizer deq2(vx, bx); + + Dequantizer * deq[k_nx]; + deq[0] = &deq1; + deq[1] = &deq2; + + __m512i scales[2*k_nx]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + auto accd = _mm512_setzero_ps(); + auto accm = _mm256_setzero_ps(); + + for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix); + + for (int i = 0; i < nb/k_nx; ++i) { + + for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx); + + for (int kx = 0; kx < k_nx; ++kx) { + compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd); + } + + } + if (2*(nb/2) < nb) { + int i0 = 2*(nb/2); + deq[0]->new_block(i0, q8, &accm, scales); + compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd); + } + + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1)); + info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256))); + } +} + +#else +// ===================================== Vanilla AVX2 ===================================== + +struct Q4Bits { + inline void prepare(const uint8_t * q4, int j) { + auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0); + values[0] = _mm256_and_si256(q4bits, ml); + values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1); + values[2] = _mm256_and_si256(q4bits, ml); + values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + } + inline void prepare64(const uint8_t * q4, int j) { + auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0); + values[0] = _mm256_and_si256(q4bits, ml); + values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1); + values[1] = _mm256_and_si256(q4bits, ml); + values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + } + inline void 
prepare16(const uint8_t * q4, int j) { + values[0] = dequant16(q4 + 64*j + 0); + values[1] = dequant16(q4 + 64*j + 16); + values[2] = dequant16(q4 + 64*j + 32); + values[3] = dequant16(q4 + 64*j + 48); + } + inline __m256i dequant16(const uint8_t * qs) const { + const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs); + const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128); + return _mm256_and_si256(ml, aux256); + }; + __m256i values[4]; + const __m256i ml = _mm256_set1_epi8(0xf); +}; + +struct Q2Bits { + inline void prepare(const uint8_t * q2, int j) { + auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j); + values[0] = _mm256_and_si256(q2bits, ml); + values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml); + values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml); + values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml); + } + __m256i values[4]; + const __m256i ml = _mm256_set1_epi8(0x03); +}; + +struct HighBit5 { + inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); } + inline void apply(Q4Bits& bits, bool do_shift) { + bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh)); + bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh)); + bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); + bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh)); + if (do_shift) { + hbits = _mm256_srli_epi16(hbits, 4); + } + } + const __m256i mh = _mm256_set1_epi8(0x10); + __m256i hbits; +}; + +struct HighBit3 { + inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); } + inline void apply(Q2Bits& bits, bool do_shift) { + bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); + bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh)); + bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh)); + bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh)); + if (do_shift) { + hbits = _mm256_srli_epi16(hbits, 4); + } + } + const __m256i mh = _mm256_set1_epi8(0x04); + __m256i hbits; +}; + + +/* +template +inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { + if (j == 0) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); + } + } else { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + const __m256i p4 = _mm256_madd_epi16(scales[3], 
_mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); + } + } +}*/ + +struct DequantizerQ4K final : public BaseDequantizer { + DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { + d = GGML_FP16_TO_FP32(x[i].d); + return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + } + + Q4Bits bits; + Scales8K s8k; +}; + +struct DequantizerIQ4XS final : public BaseDequantizer { + DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {} + template + inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { + d = GGML_FP16_TO_FP32(x[i].d); + auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h); + s8k.accum_mins(scales128, q8, i, -128.f*d, accd); + return MM256_SET_M128I(scales128, scales128); + } + inline void prepare(int i, int j) { + bits.prepare16(x[i].qs, j); + bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]); + bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]); + bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]); + bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]); + } + + static __m256i load_values() { + static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; + auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); + return MM256_SET_M128I(val128, val128); + } + + Q4Bits bits; + Scales8K s8k; + ScaleIQ4XS siq4; + const __m256i values; +}; + +struct DequantizerQ5K final : public BaseDequantizer { + DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { + d = GGML_FP16_TO_FP32(x[i].d); + hbits.load(x[i].qh); + return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + hbits.apply(bits, j == 0); + } + + Q4Bits bits; + HighBit5 hbits; + Scales8K s8k; +}; + +template +inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d, + __m256 * accm, __m256i * scales) { + const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); + process_mins_16(all_scales, q8, i, d, accm); + prepare_scales_16(all_scales, scales); +} + +struct DequantizerQ3K final : public BaseDequantizer { + DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + hbits.load(x[i].hmask); + process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + hbits.apply(bits, j == 0); + } + + Q2Bits bits; + HighBit3 hbits; + ScaleQ3 sc3; + + const __m128i m32 = _mm_set1_epi8(-32); +}; + +struct DequantizerQ2K final : public BaseDequantizer { + DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i 
scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm); + prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + } + + Q2Bits bits; + + const __m128i m4 = _mm_set1_epi8(0xf); +}; + +struct DequantizerQ6K final : public BaseDequantizer { + DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales); + } + inline void prepare(int i, int j) { + bits.prepare64(x[i].ql, j); + auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j); + bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh)); + bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); + bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh)); + bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh)); + } + + Q4Bits bits; + const __m256i mh = _mm256_set1_epi8(0x30); +}; + +inline __m256i get_scale_shuffle_8(int i); + +inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales); + +inline __m256i get_scale_shuffle_16(int i); + +inline void set_scales_16(const __m256i& all_scales, __m256i* scales); + + +template +static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%QK_K == 0); + const int nb = n/QK_K; + + Q8 q8(info); + + __m256i all_scales[2]; + __m256i scales[4]; + __m256 accd[nrc_y]; + + Dequantizer deq(vx, bx); + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accd, all_scales); + + __m256i sumi[nrc_y]; + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j); + set_scales_16(all_scales[j], scales); + multiply_add(deq.bits, scales, j, i, q8, sumi); + } + + for (int iy = 0; iy < nrc_y; ++iy) { + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + + } + +} + +template +static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accd[nrc_y]; + __m256i scales[4]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + auto all_scales = deq.new_block(i, q8, accd); + + __m256i sumi[nrc_y]; + + for (int j = 0; j < QK_K/128; ++j) { + + deq.prepare(i, j); + + set_scales_8(all_scales, j, scales); + + multiply_add(deq.bits, scales, j, i, q8, sumi); + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); + accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + + } +} +#endif // Zen4 or vanilla AVX2 + + + +// +// 
============================== Legacy quants +// + +struct DotHelper { + const __m256i m1 = _mm256_set1_epi16(1); +#if defined(__AVX512VNNI__) && defined(__AVX512VL__) + inline __m256i dot(__m256i x, __m256i y) const { + return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y); + } +#else + inline __m256i dot(__m256i x, __m256i y) const { + return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y)); + } +#endif +}; + +struct SignedDot { + DotHelper helper; + inline __m256i compute(__m256i x, __m256i y) const { + return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x)); + } +}; +struct UnsignedDot { + DotHelper helper; + inline __m256i compute(__m256i x, __m256i y) const { + return helper.dot(x, y); + } +}; +template struct Sum4 { + Dot dot; + inline __m256i compute(const __m256i * qx, const Q8 * y) const { + const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs)); + const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs)); + const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs)); + const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs)); + const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1 + const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3 + return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3 + } +}; + +struct Sum4_Q8 { + SignedDot dot; + static inline __m256i add1(__m256i a, __m256i b) { + return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b)); + } + static inline __m256i add2(__m256i a, __m256i b) { + return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b)); + } + inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const { + const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs)); + const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs)); + const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs)); + const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs)); + const __m256i p01 = add1(p0, p1); // 0,1, 0,1, 0,1, 0,1 + const __m256i p23 = add1(p2, p3); // 2,3, 2,3, 2,3, 2,3 + return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3 + } +}; + +struct ScaleHelperQ_0 { + ggml_half scales8[4]; + template + inline __m128 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; + return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); + } + template + inline __m128 prepare4(__m128 other_scales, const Q * y) { + return _mm_mul_ps(other_scales, prepare4(y)); + } + template inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); } + template inline float prepare1(float d, const Q * y) const { return d*prepare1(y); } +}; +template +struct ScaleHelperQ_0_1 { + ggml_half scales8[4]; + template + inline __m256 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; + auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); + return _mm256_set_m128(_mm_mul_ps(s4, min), s4); + } + template + inline __m256 prepare4(__m256 other_scales, const Q * y) { + return _mm_mul256_ps(other_scales, prepare4(y)); + } + template inline std::pair prepare1(const Q * y) const { + float d = GGML_FP16_TO_FP32(y->d); + return std::make_pair(d, -d*float(min_value)); + } + std::pair inline prepare1(const std::pair& dm, const 
block_q8_1 * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); + } + const __m128 min = _mm_set1_ps(float(-min_value)); +}; + +struct ScaleHelperQ_1 { + uint32_t scales8[4]; + const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); + + template + inline __m256 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) { + // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers + // complain that this breaks strict-aliasing rules. + memcpy(scales8 + j, &y[j].d, sizeof(uint32_t)); + } + return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle)); + } + + template + inline __m256 prepare4(__m256 other_scales, const Q * y) { + return _mm256_mul_ps(other_scales, prepare4(y)); + } + + template inline std::pair prepare1(const Q * y) const { + return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m)); + } + template inline std::pair prepare1(const std::pair& dm, const Q * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m)); + } + std::pair inline prepare1(const std::pair& dm, const block_q8_1 * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); + } +}; + +struct MinusType0 { + inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); } + inline float compute(float d, int) const { return d; } + inline float result(__m256 acc, int) const { return hsum_float_8(acc); } +}; + +template struct MinusType1 { + __m128 accm[nrc_y]; + MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); } + inline __m256 compute(__m256 dm, int iy) { + const __m128 d = _mm256_castps256_ps128(dm); + const __m128 m = _mm256_extractf128_ps(dm, 1); + accm[iy] = _mm_add_ps(accm[iy], m); + return _mm256_set_m128(d, d); + } + inline float compute(const std::pair& dm, int iy) { + accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f)); + return dm.first; + } + inline float result(__m256 acc, int iy) const { + const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); + return hsum_float_4(_mm_add_ps(sum, accm[iy])); + } +}; + +template struct AccumT { + __m256 acc[nrc_y]; + Minus accm; + AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); } + template + inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) { + auto qx = unp.quants(); + __m256 dall[nrc_y]; + for (int i = 0; i < nb/4; ++i) { + auto other_scales = unp.set_block_4(i); + for (int iy = 0; iy < nrc_y; ++iy) { + auto s12 = scales.prepare4(other_scales, y[iy] + 4*i); + dall[iy] = accm.compute(s12, iy); + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto pall = sum.compute(qx, y[iy] + 4*i); + acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]); + } + } + if (!is_multiple_of_4) { + for (int i = 4*(nb/4); i < nb; ++i) { + auto other_scales = unp.set_block(i); + for (int iy = 0; iy < nrc_y; ++iy) { + auto s12 = scales.prepare1(other_scales, y[iy] + i); + auto d = accm.compute(s12, iy); + const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs)); + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]); + } + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, accm.result(acc[iy], iy)); + //s[iy*bs] = accm.result(acc[iy], iy); + } + } +}; + +template 
+using AccumType0 = AccumT; + +template +using AccumType1 = AccumT, nrc_y, is_multiple_of_4>; + +using Sum4Type0 = Sum4; +using Sum4Type1 = Sum4; + +template +void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) { + Unpacker unp(vx, bx); + Sum4Type sum4; + Scales scales; + for (int ix = 0; ix < nrc_x; ++ix) { + unp.set_row(ix); + AccumType accum; + accum.compute(nb, unp, scales, sum4, y, info, ix); + } +} + +template +void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Unpacker::block_size() == 0); + Q8 q8(info); + int nb = n/Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + +template +void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Unpacker::block_size() == 0); + Q8 q8(info); + int nb = n/Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ_1, block_q8_1, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ_1, block_q8_1, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + +struct Dequantizer4bit { + const __m256i m4 = _mm256_set1_epi8(0xf); + inline __m256i dequant(const uint8_t * qs) const { + const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs); + return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4); + } +}; + +struct Q8_0_Dequantizer { + inline __m256i dequant(const block_q8_0 * x) const { + return _mm256_loadu_si256((const __m256i *)x->qs); + } +}; + +struct Q8_0_1_Dequantizer { + inline __m256i dequant(const block_q8_0 * x) const { + return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs)); + } +}; + +struct Q4_0_Dequantizer { + Dequantizer4bit b4; + const __m256i m8 = _mm256_set1_epi8(-8); + inline __m256i dequant(const block_q4_0 * x) const { + return _mm256_add_epi8(b4.dequant(x->qs), m8); + } +}; + +struct Q4_1_Dequantizer { + Dequantizer4bit b4; + inline __m256i dequant(const block_q4_1 * x) const { + return b4.dequant(x->qs); + } +}; + +struct HBitDequantizer { + const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); + const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + const __m256i minus1 = _mm256_set1_epi64x(-1); + inline __m256i to_bytes(const uint8_t * bits) const { + // Note: Data in all ggml quants is at least 2-byte aligned. 
+ // => we can cast to uint16_t and use or on two consecutive entries + // which is faster than memcpy + const uint16_t * aux16 = (const uint16_t *)bits; + const uint32_t aux32 = aux16[0] | (aux16[1] << 16); + //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t)); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle); + bytes = _mm256_or_si256(bytes, mask); + return _mm256_cmpeq_epi8(bytes, minus1); + } +}; + +struct Q5_0_Dequantizer { + Dequantizer4bit b4; + HBitDequantizer hbit; + const __m256i mh = _mm256_set1_epi8((char)0xF0); + inline __m256i dequant(const block_q5_0 * x) const { + const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh); + return _mm256_or_si256(b4.dequant(x->qs), vqh); + } +}; + +struct Q5_1_Dequantizer { + Dequantizer4bit b4; + HBitDequantizer hbit; + const __m256i mh = _mm256_set1_epi8(0x10); + inline __m256i dequant(const block_q5_1 * x) const { + const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh); + return _mm256_or_si256(b4.dequant(x->qs), vqh); + } +}; + +template +struct Q_Unpacker { + Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {} + + const char * cx_0; + const Q * x; + size_t bx; + + Scales scales; + Dequantizer deq; + + __m256i qx[4]; + + inline const __m256i* quants() const { return qx; } + + inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); } + + inline auto set_block_4(int i) { + for (int j = 0; j < 4; ++j) { + qx[j] = deq.dequant(x + 4*i + j); + } + return scales.prepare4(x + 4*i); + } + inline auto set_block(int i) { + qx[0] = deq.dequant(x + i); + return scales.prepare1(x + i); + } +}; + +struct Q8_0_Unpacker final : public Q_Unpacker { + Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_0; } +}; +struct Q8_0_1_Unpacker final : public Q_Unpacker, Q8_0_1_Dequantizer> { + Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} +// using Sum4T = Sum4TypeQ81; + inline static int block_size() { return QK8_0; } +}; +struct Q4_0_Unpacker final : public Q_Unpacker { + Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_0; } +}; +struct Q5_0_Unpacker final : public Q_Unpacker { + Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK5_0; } +}; +struct Q4_1_Unpacker final : public Q_Unpacker { + Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_1; } +}; +struct Q5_1_Unpacker final : public Q_Unpacker { + Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_1; } +}; + +template +void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Q8_0_Unpacker::block_size() == 0); + Q8 q8(info); + int nb = n/Q8_0_Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + + + + +/* +moonll +add some structs for DequantizerIQ2XXS +SimpleBits +EvenSignHelper +*/ +struct SimpleBits { + __m256i values[4]; +}; + +// fix for #829: add detection of AVX512VPOPCNTDQ +#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__) +#define HAVE_AVX512_POPCNT 1 +#else +#define HAVE_AVX512_POPCNT 0 +#endif + +struct EvenSignHelper { + #if defined
HAVE_FANCY_SIMD + // #pragma message("Using AVX512VPOPCNTDQ in even sign helper") + union sbits_t { + __m128i vec; + __mmask32 mask[4]; + }; + IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const { + aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask); + + // fix for #829: stay compatible with Intel Cascade Lake CPUs; if the AVX512VPOPCNTDQ extension is not available, use the fallback implementation below + #if HAVE_AVX512_POPCNT + auto pcnt = _mm256_popcnt_epi32(aux); + + #else + // fallback implementation using a standard bit-counting method + __m256i pcnt; + int* pcnt_ptr = reinterpret_cast<int*>(&pcnt); + int* aux_ptr = reinterpret_cast<int*>(&aux); // take the address of aux directly to avoid an unnecessary copy + + #pragma unroll 8 // hint the compiler to unroll the loop and improve SIMD throughput + for (int i = 0; i < 8; i++) { + pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // use the compiler built-in popcount + } + #endif + + sbits_t sbits; + sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7))); + values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]); + values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]); + //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7))); + //const __mmask32 * m32 = (const __mmask32 *)&sign_bits; + //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]); + //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]); + } + const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0); + const __m256i mask = _mm256_set1_epi32(127); + const __m256i mone = _mm256_set1_epi32(1); + #else + inline void sign_value(uint32_t aux32, __m256i& value) const { + auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127], + keven_signs[(aux32 >> 7) & 127], keven_signs[(aux32 >> 0) & 127]); + value = _mm256_sign_epi8(value, signs); + } + #endif +}; + +/* +moonll add multiply_add for mul_mat_qX_K_q8_K_IQ_1 +add func +get_scale_shuffle_8 +get_scale_shuffle_16 +set_scales_16 +*/ + +inline __m256i get_scale_shuffle_8(int i) { + return _mm256_set1_epi16((2*i) | ((2*i+1) << 8)); +} + +inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) { + scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0)); + scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1)); + scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2)); + scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3)); +} + + +inline __m256i get_scale_shuffle_16(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} + +inline void set_scales_16(const __m256i& all_scales, __m256i * scales) { + scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0)); + scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1)); + scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2)); + scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3)); +} + + +template +inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i
* sumi) { + if (j == 0) { +#ifdef HAVE_FANCY_SIMD + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + } +#else + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); + } +#endif + } else { +#ifdef HAVE_FANCY_SIMD + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + } +#else + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); + } +#endif + } +} + +/* +moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1 +add func +set_scales_8_iq +set_scales_16_iq + +add MUL_MAT +mul_mat_qX_K_q8_K_IQ_1 +mul_mat_qX_K_q8_K_IQ_N +mul_mat_qX_K_q8_K_IQ +*/ + +template +inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) { + if (j == 0) { +#ifdef HAVE_FANCY_SIMD + auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); + auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); + auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); + auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); + sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2)); + sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4)); +#else + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); + const __m256i p3 = _mm256_madd_epi16(scales[2], 
_mm256_maddubs_epi16(bits.values[2], q8[2])); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); + sumi[0] = _mm256_add_epi32(p1, p3); + sumi[1] = _mm256_add_epi32(p2, p4); +#endif + } else { +#ifdef HAVE_FANCY_SIMD + auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); + auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); + auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); + auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); + sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2)); + sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4)); +#else + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); + sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3)); + sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4)); +#endif + } +} + + +inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) { + //#ifdef HAVE_FANCY_SIMD + auto shuffle = j == 0 ? _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100) + : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908); + scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); + scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4))); + //#else + // set_scales_8(all_scales, j, scales); + //#endif + } + +inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) { + #ifdef HAVE_FANCY_SIMD + auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100); + scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); + scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8))); + #else + set_scales_16(all_scales, scales); + #endif + } + +template +static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const int nb = n / QK_K; + Q8<1> q8(info); + Dequantizer deq(vx, bx); + __m256i scales[2]; + __m256i q8_quants[4]; + for (int ix = 0; ix < nrc_x; ++ix) { + + __m256 accd = _mm256_setzero_ps(); + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + __m256i sumi[2], all_scales[Dequantizer::num_blocks/8]; + deq.new_block(i, all_scales); + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j, q8, q8_quants); + if constexpr (Dequantizer::num_blocks == 8) { + set_scales_8_iq(j, all_scales[0], scales); + } else { + set_scales_16_iq(all_scales[j], scales); + } + multiply_add_1(j, deq.bits, scales, q8_quants, sumi); + } + accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd); + } + + info.store(ix, 0, hsum_float_8(accd)); + } + } + + +template +static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const int nb = n / QK_K; + Q8 q8(info); + Dequantizer deq(vx, bx); + __m256i scales[4]; + __m256 accd[nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + 
deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8]; + //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256(); + __m256i mins; + float dmin = deq.new_block(i, all_scales, mins); + for (int iy = 0; iy < nrc_y; ++iy) { + auto bsums = q8.load_bsums(iy, i); + auto prod = _mm256_madd_epi16(mins, bsums); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j); + if constexpr (Dequantizer::num_blocks == 8) { + set_scales_8(all_scales[0], j, scales); + } else { + set_scales_16(all_scales[j], scales); + } + //multiply_add_iq(deq.bits, scales, j, i, q8, sumi); + multiply_add(deq.bits, scales, j, i, q8, sumi); + } + for (int iy = 0; iy < nrc_y; ++iy) { + const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); + accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + } +} + +template +static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); +#ifdef HAVE_FANCY_SIMD + if constexpr (nrc_y == 1) { + mul_mat_qX_K_q8_K_IQ_1(n, vx, bx, info, nrc_x); + } else { + mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); + } +#else + mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); +#endif +} + +/* +moonll iq1s +core func for iq1s mul_mat_iq1_s_q8_K + +*/ + +template +static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(n%QK_K == 0); + Q8 q8(info); + __m256i qx[8]; + __m256i scales[4]; + __m256 acc[nrc_y] = {}; + auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000 + __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100); + for (int ix = 0; ix < nrc_x; ++ix) { + auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx); + for (int ibl = 0; ibl < n/QK_K; ++ibl) { + float d = GGML_FP16_TO_FP32(iq1s[ibl].d); + auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh); + auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7)); + scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1)); +#ifdef HAVE_FANCY_SIMD + auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask); + auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9)); +#else + auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask); + auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7))); +#endif + deltas128 = _mm_mullo_epi16(scales128, deltas128); + scales128 = _mm_slli_epi16(scales128, 3); + auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128); + auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128); + auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7 + auto all_scales = MM256_SET_M128I(scales128, scales128); + auto shuffle = shuffle0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle); + shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)); + } + const uint8_t * qs = iq1s[ibl].qs; + const uint16_t * qh = iq1s[ibl].qh; + for (int ib = 0; ib < QK_K/32; ib += 2) { + qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | 
((qh[ib+0] << 2) & 0x700)], + iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)], + iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + qs += 8; + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto bsums = q8.load_bsums(iy, ibl); + auto sumi = _mm256_setzero_si256(); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0); + auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1); +#ifdef HAVE_FANCY_SIMD + auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1); + auto dot2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2); + sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2)); +#else + auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1); + auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2); + auto dot = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2)); + sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot)); +#endif + } +#ifdef HAVE_FANCY_SIMD + sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas); +#else + sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas)); +#endif + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]); + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, 0.125f*hsum_float_8(acc[iy])); + acc[iy] = _mm256_setzero_ps(); + } + } +} + +/* +moonll iq1s +DequantizerIQ2XXS +DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S +*/ + +struct DequantizerIQ2XXS final : public BaseDequantizer { + DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + constexpr static int num_blocks = 8; + + union Data { + __m256i vec; + uint32_t val[8]; + }; + + inline __m128i load_scales(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + const uint16_t * a16 = (const uint16_t *)x[i].qs; + auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12); + return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1)); + } + + inline void new_block(int i, __m256i * scales) { + auto sc16 = load_scales(i); + scales[0] = MM256_SET_M128I(sc16, sc16); + } + inline float new_block(int i, __m256i * scales, __m256i& mins) { + auto sc16 = load_scales(i); + mins = scb.shuffle(sc16); + scales[0] = MM256_SET_M128I(sc16, sc16); + return -d*minv; + } + + inline static void make4(const uint32_t * aux32, __m256i * values) { + const uint8_t * aux8 = (const uint8_t *)aux32; + values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]); + values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]); + values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]); + values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]); + } + + IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const { +#ifdef HAVE_FANCY_SIMD + esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0); + esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2); +#else + 
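+        // Fallback when HAVE_FANCY_SIMD is not defined: apply the stored sign words to the four value vectors one at a time.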
esh.sign_value(aux32[1], values[0]); + esh.sign_value(aux32[3], values[1]); + esh.sign_value(aux32[5], values[2]); + esh.sign_value(aux32[7], values[3]); +#endif + } + inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const { + make4(aux32, values); + sign_values(aux32, values); + for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value); + } + inline void make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const { + make4(aux32, values); + sign_values(aux32, q8); + } + inline void prepare(int i, int j) { + Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); + make4_signed(data.val, min_value, bits.values); + } + inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) { + for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k); + Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); + make4(data.val, bits.values, q8_quants); + } + + constexpr static int minv = 43; + SimpleBits bits; + Scales8KBase scb; + EvenSignHelper esh; + const __m256i min_value = _mm256_set1_epi8(minv); + const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1); +}; + +/* +moonll +add Q8_0_Unpacker && DequantizerIQ2XXS support +add func mul_mat_qX_K_q8_K_IQ +*/ + +template void MulMat::set_functions(MulMat& m) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qX_0_q8_0_T; + m.funcs[1] = mul_mat_qX_0_q8_0_T; + m.funcs[2] = mul_mat_qX_0_q8_0_T; + m.funcs[3] = mul_mat_qX_0_q8_0_T; + m.funcs[4] = mul_mat_qX_0_q8_0_T; + m.funcs[5] = mul_mat_qX_0_q8_0_T; + m.funcs[6] = mul_mat_qX_0_q8_0_T; + m.funcs[7] = mul_mat_qX_0_q8_0_T; + } + else if constexpr (std::is_same_v || std::is_same_v|| std::is_same_v) { + m.funcs[0] = mul_mat_qX_1_q8_1_T; + m.funcs[1] = mul_mat_qX_1_q8_1_T; + m.funcs[2] = mul_mat_qX_1_q8_1_T; + m.funcs[3] = mul_mat_qX_1_q8_1_T; + m.funcs[4] = mul_mat_qX_1_q8_1_T; + m.funcs[5] = mul_mat_qX_1_q8_1_T; + m.funcs[6] = mul_mat_qX_1_q8_1_T; + m.funcs[7] = mul_mat_qX_1_q8_1_T; + } + else if constexpr (std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQ; + m.funcs[1] = mul_mat_qX_K_q8_K_IQ; + m.funcs[2] = mul_mat_qX_K_q8_K_IQ; + m.funcs[3] = mul_mat_qX_K_q8_K_IQ; + m.funcs[4] = mul_mat_qX_K_q8_K_IQ; + m.funcs[5] = mul_mat_qX_K_q8_K_IQ; + m.funcs[6] = mul_mat_qX_K_q8_K_IQ; + m.funcs[7] = mul_mat_qX_K_q8_K_IQ; + } + else { +#ifdef HAVE_FANCY_SIMD + if constexpr (std::is_same_v) { + m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512; + } else { + m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; + m.funcs[1] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[2] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[3] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[4] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[5] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[6] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[7] = mul_mat_qX_K_q8_K_AVX512; + } +#else + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qY_K_q8_K_T; + m.funcs[1] = mul_mat_qY_K_q8_K_T; + m.funcs[2] = mul_mat_qY_K_q8_K_T; + m.funcs[3] = mul_mat_qY_K_q8_K_T; + m.funcs[4] = mul_mat_qY_K_q8_K_T; + m.funcs[5] = mul_mat_qY_K_q8_K_T; + m.funcs[6] = mul_mat_qY_K_q8_K_T; + m.funcs[7] = mul_mat_qY_K_q8_K_T; + } else { + 
m.funcs[0] = mul_mat_qX_K_q8_K_T; + m.funcs[1] = mul_mat_qX_K_q8_K_T; + m.funcs[2] = mul_mat_qX_K_q8_K_T; + m.funcs[3] = mul_mat_qX_K_q8_K_T; + m.funcs[4] = mul_mat_qX_K_q8_K_T; + m.funcs[5] = mul_mat_qX_K_q8_K_T; + m.funcs[6] = mul_mat_qX_K_q8_K_T; + m.funcs[7] = mul_mat_qX_K_q8_K_T; + } +#endif + } +} + +struct QFBase { + #ifdef __AVX512F__ + constexpr static int k_step = 16; + using Data = __m512; + using Acc = __m512; + static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); } + static inline Data load(const float * x) { return _mm512_loadu_ps(x); } + static inline Data load(const ggml_bf16_t * x) { + return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16)); + } + static inline Acc acc(Acc prev, const Data& y, const Data& x) { + return _mm512_fmadd_ps(y, x, prev); + } + static inline Acc acc_first(const Data& y, const Data& x) { + return _mm512_mul_ps(y, x); + } + static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); } + static inline float hsum(Acc acc) { + return _mm512_reduce_add_ps(acc); + } + template + static inline Data load4Floats(const Float * x) { + return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0); + } + static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { + acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc); + acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_r4_first(const Data * xv, const Data& yv) { + auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00)); + acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline __m128 hsum_r4(Acc acc) { + auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1)); + auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3)); + return _mm_add_ps(sum1, sum2); + } + #else + constexpr static int k_step = 8; + using Data = __m256; + using Acc = __m256; + static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); } + static inline Data load(const float * x) { return _mm256_loadu_ps(x); } + static inline Data load(const ggml_bf16_t * x) { + return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16)); + } + static inline Acc acc(Acc prev, const Data& y, const Data& x) { + return _mm256_fmadd_ps(y, x, prev); + } + static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); } + static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { + acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc); + acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_r4_first(const Data * xv, const Data& yv) { + auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00)); + acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); + acc = 
_mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_first(const Data& y, const Data& x) { + return _mm256_mul_ps(y, x); + } + static inline float hsum(Acc acc) { + return hsum_float_8(acc); + } + static inline __m128 hsum_r4(Acc acc) { + return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); + } + template + static inline Data load4Floats(const Float * x) { + return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0); + } + #endif + static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); } + static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); } + static inline __m128 load128(const ggml_bf16_t * x) { + return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16)); + } + }; + template struct QFT final : public QFBase { + constexpr static int nrc = nrc_in; + QFT(const DataInfo& info) { + for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy); + } + QFT(const char * cx, size_t bx) { + for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx); + } + IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); } + IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); } + IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const { + xv[0] = load1(ix+0, i); + xv[1] = load1(ix+1, i); + xv[2] = load1(ix+2, i); + xv[3] = load1(ix+3, i); + #ifdef __AVX512F__ + auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]); + auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]); + auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]); + auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]); + xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); + xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); + xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); + xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); + #else + auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]); + auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]); + auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]); + auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]); + xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); + xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); + xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); + xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); + #endif + } + const Float * y[nrc]; + }; + + + +template +IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) { + int nb = n/QFBase::k_step; + int nb4 = n/4; + Qy y(info); + Qx x(cx + ix0*bx, bx); + QFBase::Data xv[Qx::nrc]; + QFBase::Acc acc[Qx::nrc*Qy::nrc]; + auto yv = y.load1(0, 0); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load1(ix, 0); + acc[ix] = QFBase::acc_first(yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load1(iy, 0); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]); + } + for (int i = 1; i < nb; ++i) { + yv = y.load1(0, i); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load1(ix, i); + acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load1(iy, i); + for (int ix = 0; ix < Qx::nrc; ++ix) 
acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); + } + } + for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) { + yv = y.load_tail(0, i); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load_tail(ix, i); + acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load_tail(iy, i); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); + } + } + for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix])); +} +// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done +// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in +// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now. +template +void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const char * cx = (const char *)vx; + // TBD if we want this + //if constexpr (nrc_y == 1) { + // constexpr int k_nx = 2; + // for (int ix = 0; ix < nrc_x/k_nx; ++ix) { + // mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, ix*k_nx, info); + // } + // if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) { + // int nx = nrc_x - lastx; + // switch (nx) { + // case 1: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // case 2: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // case 3: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // } + // //mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); + // } + // return; + //} +#ifdef __AVX512F__ + constexpr int k_nx = 5; +#else + constexpr int k_nx = nrc_y == 1 ? 4 : 2; +#endif + for (int ix = 0; ix < nrc_x/k_nx; ++ix) { + mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, ix*k_nx, info); + } + int last_x = k_nx*(nrc_x/k_nx); + if (last_x == nrc_x) return; + int nx = nrc_x - last_x; +#ifdef __AVX512F__ + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 4: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } +#else + if constexpr (nrc_y == 1) { + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } + } else { + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } + } +#endif +} + +template +void set_mul_mat_f(MulMat& mm) { + for (auto& f : mm.funcs) f = nullptr; + mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>; + mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>; + mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>; + mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>; + mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>; +#ifndef __AVX512F__ + mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>; +#endif +} + + + +/* +moonll +add typeb TO compare return not expected type of weight matrix +add IQ2XSS +add IQ1_S +add GGML_TYPE_IQ4_XS +*/ + +bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { + (void)Ny; + + auto expected_typeB = GGML_TYPE_Q8_K; + switch (typeA) { + case GGML_TYPE_Q2_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q3_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q4_K: + assert (ne00 % QK_K == 0); + 
MulMat::set_functions(mm); + break; + case GGML_TYPE_Q5_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q6_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_IQ4_XS: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_IQ2_XXS: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q4_0: + assert (ne00 % QK4_0 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0; + break; + case GGML_TYPE_Q4_1: + assert (ne00 % QK4_1 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_1_X4; + break; + case GGML_TYPE_Q5_0: + assert (ne00 % QK5_0 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0; + break; + case GGML_TYPE_Q5_1: + assert (ne00 % QK5_1 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_1_X4; + break; + case GGML_TYPE_Q8_0: + assert (ne00 % QK8_0 == 0); +#ifdef HAVE_FANCY_SIMD + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_1_X4; +#else + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0_X4; +#endif + break; + case GGML_TYPE_IQ1_S: + mm.funcs[0] = mul_mat_iq1_s_q8_K<1>; + mm.funcs[1] = mul_mat_iq1_s_q8_K<2>; + mm.funcs[2] = mul_mat_iq1_s_q8_K<3>; + mm.funcs[3] = mul_mat_iq1_s_q8_K<4>; + mm.funcs[4] = mul_mat_iq1_s_q8_K<5>; + mm.funcs[5] = mul_mat_iq1_s_q8_K<6>; + mm.funcs[6] = mul_mat_iq1_s_q8_K<7>; + mm.funcs[7] = mul_mat_iq1_s_q8_K<8>; + #ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_iq1_s_q8_K<16>; + #endif + // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + expected_typeB = GGML_TYPE_Q8_K; + break; + + default: + { + printf("case:%d",typeA); + return false; + } + + } + + + + return ggml_type(typeB) == expected_typeB; + +} + +} // namespace + +/* +iq1_s is not support for arm +*/ +#else // __aarch64__ +#include + +namespace { + +template struct Q8 { + + constexpr static int nrc_y = nrc; + + Q8(const DataInfo& info) { + for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy); + } + + inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); } + inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); } + inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); } + inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); } + inline int16x8_t load_bsums8(int iy, int i) const { + auto q8s = vld1q_s16_x2(y[iy][i].bsums); + return vpaddq_s16(q8s.val[0], q8s.val[1]); + } + inline float scale(int iy, int i) const { return y[iy][i].d; } + + const block_q8 * y[nrc_y]; +}; + +template +struct BaseDequantizer { + BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {} + inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); } + const void * vx; + const block_q * x; + const size_t bx; + const int nrc; +}; + +struct Q4bits { + const uint8x16_t m4b = vdupq_n_u8(0xf); + uint8x16x4_t b1, b2; + inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const { + b.val[0] = vandq_u8(val[0], m4b); + b.val[2] = vshrq_n_u8(val[0], 4); + b.val[1] = vandq_u8(val[1], m4b); + b.val[3] = vshrq_n_u8(val[1], 4); + } + inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const { + b.val[0] = vandq_u8(val[0], m4b); + b.val[1] = vshrq_n_u8(val[0], 4); + b.val[2] = vandq_u8(val[1], m4b); + b.val[3] = vshrq_n_u8(val[1], 4); + } + 
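+    // prepare(): load 64 bytes of packed 4-bit quants; b1 takes bytes 0..31 and b2 bytes 32..63, with low nibbles in val[0..1] and high nibbles in val[2..3] of each half.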
inline void prepare(const uint8_t * qs) { + auto q4bits = vld1q_u8_x2(qs); + prepare4(b1, q4bits.val); + q4bits = vld1q_u8_x2(qs+32); + prepare4(b2, q4bits.val); + } + inline void prepare_v2(const uint8_t * qs) { + auto q4bits = vld1q_u8_x4(qs); + prepare4(b1, q4bits.val+0); + prepare4(b2, q4bits.val+2); + } + inline void prepare64(const uint8_t * qs) { + auto q4bits = vld1q_u8_x4(qs); + b1.val[0] = vandq_u8(q4bits.val[0], m4b); + b1.val[1] = vandq_u8(q4bits.val[1], m4b); + b1.val[2] = vandq_u8(q4bits.val[2], m4b); + b1.val[3] = vandq_u8(q4bits.val[3], m4b); + b2.val[0] = vshrq_n_u8(q4bits.val[0], 4); + b2.val[1] = vshrq_n_u8(q4bits.val[1], 4); + b2.val[2] = vshrq_n_u8(q4bits.val[2], 4); + b2.val[3] = vshrq_n_u8(q4bits.val[3], 4); + } + inline void prepare16(const uint8_t * qs) { + auto q4bits = vld1q_u8_x2(qs); + prepare4_16(b1, q4bits.val); + q4bits = vld1q_u8_x2(qs+32); + prepare4_16(b2, q4bits.val); + } + inline void prepare16_v2(const uint8_t * qs) { + auto q4bits = vld1q_u8_x4(qs); + prepare4_16(b1, q4bits.val+0); + prepare4_16(b2, q4bits.val+2); + } +}; + +struct Scales8 { + uint32_t utmp[4]; + const uint8_t * sc8 = (const uint8_t *)utmp; + template + inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) { + make_q4_scales(x.scales, utmp); + int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8)); + accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin)); + + uint8x8_t scales8 = vld1_u8(sc8); + uint16x8_t scales16 = vmovl_u8(scales8); + int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))}; + return scales; + } +}; + + +struct DequantizerQ4K final : public BaseDequantizer { + DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + return s8.process_scales_mins(x[i], q8, i, acc); + } + inline void prepare(int i, int j) { + if (nrc == 1) bits.prepare_v2(x[i].qs+64*j); + else bits.prepare(x[i].qs+64*j); + } + + Q4bits bits; + Scales8 s8; + + float d; +}; + + +struct DequantizerQ6K final : public BaseDequantizer { + DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d); + } + inline void prepare(int i, int j) { + + auto hbits = vld1q_u8_x2(x[i].qh + 32*j); + + bits.prepare64(x[i].ql+64*j); + bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb)); + bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb)); + bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb)); + bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb)); + + bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb)); + bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb)); + bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb)); + bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), 
mhb)); + + } + + Q4bits bits; + + const uint8x16_t mhb = vdupq_n_u8(0x30); + + float d; +}; + +template +struct BlockQxK { + inline BlockQxK(const int maxn, const int maxk): maxn(maxn), maxk(maxk) { + values = (int8_t*)aligned_alloc(256, maxn * maxk * sizeof(int8_t)); + scales = (int*)aligned_alloc(256, maxn * maxk / SS * sizeof(int)); + ds = (float*)aligned_alloc(256, maxn * maxk / QK * sizeof(int)); + if constexpr (NeedSum) { + dmins = (float*)aligned_alloc(256, maxn * maxk / QK * sizeof(int)); + scalems = (int16_t*)aligned_alloc(256, maxn * maxk / SS * sizeof(int16_t)); + } + } + inline ~BlockQxK() { + free(values); + free(scales); + free(ds); + if constexpr (NeedSum) { + free(dmins); + free(scalems); + } + } + inline int FromDequantizer(const void * vx, size_t bx, int idx, int n_, int k_) { + n = n_; + k = k_; + bn = n / BS; + bk = k / QK; + + Dequantizer deq(vx, bx, 1); + for (int i = 0; i < n; i += BS) { + for (int j = 0; j < BS; j ++) { + deq.new_row(j + i + idx); + for (int x = 0; x < bk; x ++) { + { + int8x16_t base = NeedSum ? vdupq_n_s8(0) : vdupq_n_s8(32); + int32_t *dst = (int32_t*)(values + i*k + j*4 + x*QK*BS); + deq.prepare(x, 0); + int8x16_t v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[0]), base); + int8x16_t v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[1]), base); + int8x16_t v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[2]), base); + int8x16_t v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[3]), base); + *(dst + (0 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0); + *(dst + (1 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1); + *(dst + (2 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2); + *(dst + (3 + 0*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3); + *(dst + (0 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0); + *(dst + (1 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1); + *(dst + (2 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2); + *(dst + (3 + 1*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3); + *(dst + (0 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0); + *(dst + (1 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1); + *(dst + (2 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2); + *(dst + (3 + 2*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3); + *(dst + (0 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0); + *(dst + (1 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1); + *(dst + (2 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2); + *(dst + (3 + 3*4 + 0*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3); + v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[0]), base); + v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[1]), base); + v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[2]), base); + v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[3]), base); + *(dst + (0 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0); + *(dst + (1 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1); + *(dst + (2 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2); + *(dst + (3 + 0*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3); + *(dst + (0 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0); + *(dst + (1 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1); + *(dst + (2 + 1*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2); + *(dst + (3 + 1*4 + 1*16)*BS) = 
vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3); + *(dst + (0 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0); + *(dst + (1 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1); + *(dst + (2 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2); + *(dst + (3 + 2*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3); + *(dst + (0 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0); + *(dst + (1 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1); + *(dst + (2 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2); + *(dst + (3 + 3*4 + 1*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3); + deq.prepare(x, 1); + v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[0]), base); + v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[1]), base); + v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[2]), base); + v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b1.val[3]), base); + *(dst + (0 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0); + *(dst + (1 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1); + *(dst + (2 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2); + *(dst + (3 + 0*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3); + *(dst + (0 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0); + *(dst + (1 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1); + *(dst + (2 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2); + *(dst + (3 + 1*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3); + *(dst + (0 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0); + *(dst + (1 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1); + *(dst + (2 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2); + *(dst + (3 + 2*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3); + *(dst + (0 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0); + *(dst + (1 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1); + *(dst + (2 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2); + *(dst + (3 + 3*4 + 2*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3); + v0 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[0]), base); + v1 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[1]), base); + v2 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[2]), base); + v3 = vsubq_s8(vreinterpretq_s8_u8(deq.bits.b2.val[3]), base); + *(dst + (0 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 0); + *(dst + (1 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 1); + *(dst + (2 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 2); + *(dst + (3 + 0*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v0), 3); + *(dst + (0 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 0); + *(dst + (1 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 1); + *(dst + (2 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 2); + *(dst + (3 + 1*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v1), 3); + *(dst + (0 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 0); + *(dst + (1 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 1); + *(dst + (2 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 2); + *(dst + (3 + 2*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v2), 3); + *(dst + (0 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 0); + *(dst + (1 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 1); + *(dst + (2 
+ 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 2); + *(dst + (3 + 3*4 + 3*16)*BS) = vgetq_lane_s32(vreinterpretq_s32_s8(v3), 3); + } + if constexpr (std::is_same_v) + { + int32_t *dst = (int32_t*)(scales + i*(k/SS) + j + x*QK/SS*BS); + int8x16_t ss = vld1q_s8(deq.x[x].scales); + int16x8_t s16_0 = vmovl_s8(vget_low_s8(ss)); + int16x8_t s16_1 = vmovl_s8(vget_high_s8(ss)); + int32x4_t s32_0 = vmovl_s16(vget_low_s16(s16_0)); + int32x4_t s32_1 = vmovl_s16(vget_high_s16(s16_0)); + int32x4_t s32_2 = vmovl_s16(vget_low_s16(s16_1)); + int32x4_t s32_3 = vmovl_s16(vget_high_s16(s16_1)); + *(dst + (0+0*4)*BS) = vgetq_lane_s32(s32_0, 0); + *(dst + (1+0*4)*BS) = vgetq_lane_s32(s32_0, 1); + *(dst + (2+0*4)*BS) = vgetq_lane_s32(s32_0, 2); + *(dst + (3+0*4)*BS) = vgetq_lane_s32(s32_0, 3); + *(dst + (0+1*4)*BS) = vgetq_lane_s32(s32_1, 0); + *(dst + (1+1*4)*BS) = vgetq_lane_s32(s32_1, 1); + *(dst + (2+1*4)*BS) = vgetq_lane_s32(s32_1, 2); + *(dst + (3+1*4)*BS) = vgetq_lane_s32(s32_1, 3); + *(dst + (0+2*4)*BS) = vgetq_lane_s32(s32_2, 0); + *(dst + (1+2*4)*BS) = vgetq_lane_s32(s32_2, 1); + *(dst + (2+2*4)*BS) = vgetq_lane_s32(s32_2, 2); + *(dst + (3+2*4)*BS) = vgetq_lane_s32(s32_2, 3); + *(dst + (0+3*4)*BS) = vgetq_lane_s32(s32_3, 0); + *(dst + (1+3*4)*BS) = vgetq_lane_s32(s32_3, 1); + *(dst + (2+3*4)*BS) = vgetq_lane_s32(s32_3, 2); + *(dst + (3+3*4)*BS) = vgetq_lane_s32(s32_3, 3); + } + if constexpr (std::is_same_v) + { + int32_t *dst = (int32_t*)(scales + i*(k/SS) + j + x*QK/SS*BS); + int16_t *dst2 = (int16_t*)(scalems + i*(k/SS) + j + x*QK/SS*BS); + uint32_t utmp[4]; + const uint8_t * sc8 = (const uint8_t *)utmp; + make_q4_scales(deq.x[x].scales, utmp); + int8x16_t ss = vld1q_s8((const int8_t *)sc8); + int16x8_t scale = vmovl_s8(vget_low_s8(ss)); + int16x8_t scale_min = vmovl_high_s8(ss); + int32x4_t s32_0 = vmovl_s16(vget_low_s16(scale)); + int32x4_t s32_1 = vmovl_s16(vget_high_s16(scale)); + *(dst + (0+0*4)*BS) = vgetq_lane_s32(s32_0, 0); + *(dst + (1+0*4)*BS) = vgetq_lane_s32(s32_0, 1); + *(dst + (2+0*4)*BS) = vgetq_lane_s32(s32_0, 2); + *(dst + (3+0*4)*BS) = vgetq_lane_s32(s32_0, 3); + *(dst + (0+1*4)*BS) = vgetq_lane_s32(s32_1, 0); + *(dst + (1+1*4)*BS) = vgetq_lane_s32(s32_1, 1); + *(dst + (2+1*4)*BS) = vgetq_lane_s32(s32_1, 2); + *(dst + (3+1*4)*BS) = vgetq_lane_s32(s32_1, 3); + *(dst2 + 0*BS) = vgetq_lane_s16(scale_min, 0); + *(dst2 + 1*BS) = vgetq_lane_s16(scale_min, 1); + *(dst2 + 2*BS) = vgetq_lane_s16(scale_min, 2); + *(dst2 + 3*BS) = vgetq_lane_s16(scale_min, 3); + *(dst2 + 4*BS) = vgetq_lane_s16(scale_min, 4); + *(dst2 + 5*BS) = vgetq_lane_s16(scale_min, 5); + *(dst2 + 6*BS) = vgetq_lane_s16(scale_min, 6); + *(dst2 + 7*BS) = vgetq_lane_s16(scale_min, 7); + } + { + float *dst = ds + i*bk + j + x*BS; + *dst = GGML_FP16_TO_FP32(deq.x[x].d); + } + if constexpr (std::is_same_v) + { + float *dst = dmins + i*bk + j + x*BS; + *dst = - GGML_FP16_TO_FP32(deq.x[x].dmin); + } + } + } + } + return 0; + } + + int8_t *values; // [bn][k/4][BS][4] + int *scales; // [bn][k/SS][BS] + float *ds; // [bn][bk][BS] + float *dmins; // [bn][bk][BS] + int16_t *scalems; // [bn][k/SS][BS] + + static constexpr int BS = 8; + static constexpr int QK = 256; + static constexpr int SS = std::is_same_v ? 16 : 32; + static constexpr int NeedSum = std::is_same_v ? 
0 : 1; + const int maxn; + const int maxk; + int n; + int k; + int bn; + int bk; +}; + +template +IQK_NOINLINE void matmul_v2_kernel(const Dequantizer *a, const block_q8_K *y[BN], const DataInfo &info, int idx, int idy) { + constexpr int BS = a->BS; + constexpr int QK = a->QK; + constexpr int SS = a->SS; + for (int s = 0; s < a->n; s += BS) { + float32x4_t cc[BN][BS/4]; + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + cc[i][j] = vdupq_n_f32(0); + } + } + const int8_t *a_ptr = a->values + s*a->k; + const int8_t *b_ptr[BN]; + for (int k = 0; k < a->bk; k ++) { + for (int i = 0; i < BN; i ++) { + b_ptr[i] = y[i][k].qs; + } + int32x4_t cci[BN][BS/4]; + if constexpr (BN == 4 && SS == 16) { + int64_t length = QK/SS; + auto ap = a_ptr; + auto sp = a->scales + s*a->k/SS + (k*QK/SS)*BS; + // asm volatile ( + asm volatile ( + " eor %[c00].16b, %[c00].16b, %[c00].16b \n" + " eor %[c10].16b, %[c10].16b, %[c10].16b \n" + " eor %[c20].16b, %[c20].16b, %[c20].16b \n" + " eor %[c30].16b, %[c30].16b, %[c30].16b \n" + " eor %[c01].16b, %[c01].16b, %[c01].16b \n" + " eor %[c11].16b, %[c11].16b, %[c11].16b \n" + " eor %[c21].16b, %[c21].16b, %[c21].16b \n" + " eor %[c31].16b, %[c31].16b, %[c31].16b \n" + " loop_%=: \n" + " subs %[len], %[len], #1 \n" + " ld1 {v12.16b}, [%[bp0]], #16 \n" + " ld1 {v13.16b}, [%[bp1]], #16 \n" + " ld1 {v14.16b}, [%[bp2]], #16 \n" + " ld1 {v15.16b}, [%[bp3]], #16 \n" + " prfm pldl1strm, [%[ap], #256] \n" + " ld1 {v8.16b}, [%[ap]], #16 \n" + " ld1 {v9.16b}, [%[ap]], #16 \n" + " eor v0.16b, v0.16b, v0.16b \n" + " eor v1.16b, v1.16b, v1.16b \n" + " eor v2.16b, v2.16b, v2.16b \n" + " eor v3.16b, v3.16b, v3.16b \n" + " eor v4.16b, v4.16b, v4.16b \n" + " eor v5.16b, v5.16b, v5.16b \n" + " eor v6.16b, v6.16b, v6.16b \n" + " eor v7.16b, v7.16b, v7.16b \n" + " ld1 {v10.16b}, [%[ap]], #16 \n" + " ld1 {v11.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v8.16b, v12.4b[0] \n" + " sdot v1.4s, v8.16b, v13.4b[0] \n" + " sdot v2.4s, v8.16b, v14.4b[0] \n" + " sdot v3.4s, v8.16b, v15.4b[0] \n" + " sdot v4.4s, v9.16b, v12.4b[0] \n" + " sdot v5.4s, v9.16b, v13.4b[0] \n" + " sdot v6.4s, v9.16b, v14.4b[0] \n" + " sdot v7.4s, v9.16b, v15.4b[0] \n" + " prfm pldl1strm, [%[ap], #256] \n" + " ld1 {v8.16b}, [%[ap]], #16 \n" + " ld1 {v9.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v10.16b, v12.4b[1] \n" + " sdot v1.4s, v10.16b, v13.4b[1] \n" + " sdot v2.4s, v10.16b, v14.4b[1] \n" + " sdot v3.4s, v10.16b, v15.4b[1] \n" + " sdot v4.4s, v11.16b, v12.4b[1] \n" + " sdot v5.4s, v11.16b, v13.4b[1] \n" + " sdot v6.4s, v11.16b, v14.4b[1] \n" + " sdot v7.4s, v11.16b, v15.4b[1] \n" + " ld1 {v10.16b}, [%[ap]], #16 \n" + " ld1 {v11.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v8.16b, v12.4b[2] \n" + " sdot v1.4s, v8.16b, v13.4b[2] \n" + " sdot v2.4s, v8.16b, v14.4b[2] \n" + " sdot v3.4s, v8.16b, v15.4b[2] \n" + " sdot v4.4s, v9.16b, v12.4b[2] \n" + " sdot v5.4s, v9.16b, v13.4b[2] \n" + " sdot v6.4s, v9.16b, v14.4b[2] \n" + " sdot v7.4s, v9.16b, v15.4b[2] \n" + " ld1 {v8.4s}, [%[sp]], #16 \n" + " ld1 {v9.4s}, [%[sp]], #16 \n" + " sdot v0.4s, v10.16b, v12.4b[3] \n" + " sdot v1.4s, v10.16b, v13.4b[3] \n" + " sdot v2.4s, v10.16b, v14.4b[3] \n" + " sdot v3.4s, v10.16b, v15.4b[3] \n" + " sdot v4.4s, v11.16b, v12.4b[3] \n" + " sdot v5.4s, v11.16b, v13.4b[3] \n" + " sdot v6.4s, v11.16b, v14.4b[3] \n" + " sdot v7.4s, v11.16b, v15.4b[3] \n" + " mla %[c00].4s, v0.4s, v8.4s \n" + " mla %[c10].4s, v1.4s, v8.4s \n" + " mla %[c20].4s, v2.4s, v8.4s \n" + " mla %[c30].4s, v3.4s, v8.4s \n" + " mla %[c01].4s, v4.4s, v9.4s \n" + " mla 
%[c11].4s, v5.4s, v9.4s \n" + " mla %[c21].4s, v6.4s, v9.4s \n" + " mla %[c31].4s, v7.4s, v9.4s \n" + " bne loop_%= \n" + " exit_%=:\n" + : [len] "+r" (length) + , [ap] "+r" (ap) + , [bp0] "+r" (b_ptr[0]) + , [bp1] "+r" (b_ptr[1]) + , [bp2] "+r" (b_ptr[2]) + , [bp3] "+r" (b_ptr[3]) + , [sp] "+r" (sp) + , [c00] "+w" (cci[0][0]) + , [c10] "+w" (cci[1][0]) + , [c20] "+w" (cci[2][0]) + , [c30] "+w" (cci[3][0]) + , [c01] "+w" (cci[0][1]) + , [c11] "+w" (cci[1][1]) + , [c21] "+w" (cci[2][1]) + , [c31] "+w" (cci[3][1]) + : + : "v0", "v1", "v2", "v3" + , "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11" + , "v12", "v13", "v14", "v15" + , "memory", "cc" + ); + a_ptr += BS * QK; + } else if (BN == 4 && SS == 32) { + int64_t length = QK/SS; + auto ap = a_ptr; + auto sp = a->scales + s*a->k/SS + (k*QK/SS)*BS; + // asm volatile ( + asm volatile ( + " eor %[c00].16b, %[c00].16b, %[c00].16b \n" + " eor %[c10].16b, %[c10].16b, %[c10].16b \n" + " eor %[c20].16b, %[c20].16b, %[c20].16b \n" + " eor %[c30].16b, %[c30].16b, %[c30].16b \n" + " eor %[c01].16b, %[c01].16b, %[c01].16b \n" + " eor %[c11].16b, %[c11].16b, %[c11].16b \n" + " eor %[c21].16b, %[c21].16b, %[c21].16b \n" + " eor %[c31].16b, %[c31].16b, %[c31].16b \n" + " loop_%=: \n" + " subs %[len], %[len], #1 \n" + " ld1 {v12.16b}, [%[bp0]], #16 \n" + " ld1 {v13.16b}, [%[bp1]], #16 \n" + " ld1 {v14.16b}, [%[bp2]], #16 \n" + " ld1 {v15.16b}, [%[bp3]], #16 \n" + " prfm pldl1strm, [%[ap], #256] \n" + " ld1 {v8.16b}, [%[ap]], #16 \n" + " ld1 {v9.16b}, [%[ap]], #16 \n" + " eor v0.16b, v0.16b, v0.16b \n" + " eor v1.16b, v1.16b, v1.16b \n" + " eor v2.16b, v2.16b, v2.16b \n" + " eor v3.16b, v3.16b, v3.16b \n" + " eor v4.16b, v4.16b, v4.16b \n" + " eor v5.16b, v5.16b, v5.16b \n" + " eor v6.16b, v6.16b, v6.16b \n" + " eor v7.16b, v7.16b, v7.16b \n" + " ld1 {v10.16b}, [%[ap]], #16 \n" + " ld1 {v11.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v8.16b, v12.4b[0] \n" + " sdot v1.4s, v8.16b, v13.4b[0] \n" + " sdot v2.4s, v8.16b, v14.4b[0] \n" + " sdot v3.4s, v8.16b, v15.4b[0] \n" + " sdot v4.4s, v9.16b, v12.4b[0] \n" + " sdot v5.4s, v9.16b, v13.4b[0] \n" + " sdot v6.4s, v9.16b, v14.4b[0] \n" + " sdot v7.4s, v9.16b, v15.4b[0] \n" + " prfm pldl1strm, [%[ap], #256] \n" + " ld1 {v8.16b}, [%[ap]], #16 \n" + " ld1 {v9.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v10.16b, v12.4b[1] \n" + " sdot v1.4s, v10.16b, v13.4b[1] \n" + " sdot v2.4s, v10.16b, v14.4b[1] \n" + " sdot v3.4s, v10.16b, v15.4b[1] \n" + " sdot v4.4s, v11.16b, v12.4b[1] \n" + " sdot v5.4s, v11.16b, v13.4b[1] \n" + " sdot v6.4s, v11.16b, v14.4b[1] \n" + " sdot v7.4s, v11.16b, v15.4b[1] \n" + " ld1 {v10.16b}, [%[ap]], #16 \n" + " ld1 {v11.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v8.16b, v12.4b[2] \n" + " sdot v1.4s, v8.16b, v13.4b[2] \n" + " sdot v2.4s, v8.16b, v14.4b[2] \n" + " sdot v3.4s, v8.16b, v15.4b[2] \n" + " sdot v4.4s, v9.16b, v12.4b[2] \n" + " sdot v5.4s, v9.16b, v13.4b[2] \n" + " sdot v6.4s, v9.16b, v14.4b[2] \n" + " sdot v7.4s, v9.16b, v15.4b[2] \n" + " prfm pldl1strm, [%[ap], #256] \n" + " ld1 {v8.16b}, [%[ap]], #16 \n" + " ld1 {v9.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v10.16b, v12.4b[3] \n" + " sdot v1.4s, v10.16b, v13.4b[3] \n" + " sdot v2.4s, v10.16b, v14.4b[3] \n" + " sdot v3.4s, v10.16b, v15.4b[3] \n" + " sdot v4.4s, v11.16b, v12.4b[3] \n" + " sdot v5.4s, v11.16b, v13.4b[3] \n" + " sdot v6.4s, v11.16b, v14.4b[3] \n" + " sdot v7.4s, v11.16b, v15.4b[3] \n" + " ld1 {v10.16b}, [%[ap]], #16 \n" + " ld1 {v11.16b}, [%[ap]], #16 \n" + " ld1 {v12.16b}, [%[bp0]], #16 \n" + " ld1 {v13.16b}, [%[bp1]], #16 \n" + " ld1 
{v14.16b}, [%[bp2]], #16 \n" + " ld1 {v15.16b}, [%[bp3]], #16 \n" + " sdot v0.4s, v8.16b, v12.4b[0] \n" + " sdot v1.4s, v8.16b, v13.4b[0] \n" + " sdot v2.4s, v8.16b, v14.4b[0] \n" + " sdot v3.4s, v8.16b, v15.4b[0] \n" + " sdot v4.4s, v9.16b, v12.4b[0] \n" + " sdot v5.4s, v9.16b, v13.4b[0] \n" + " sdot v6.4s, v9.16b, v14.4b[0] \n" + " sdot v7.4s, v9.16b, v15.4b[0] \n" + " prfm pldl1strm, [%[ap], #256] \n" + " ld1 {v8.16b}, [%[ap]], #16 \n" + " ld1 {v9.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v10.16b, v12.4b[1] \n" + " sdot v1.4s, v10.16b, v13.4b[1] \n" + " sdot v2.4s, v10.16b, v14.4b[1] \n" + " sdot v3.4s, v10.16b, v15.4b[1] \n" + " sdot v4.4s, v11.16b, v12.4b[1] \n" + " sdot v5.4s, v11.16b, v13.4b[1] \n" + " sdot v6.4s, v11.16b, v14.4b[1] \n" + " sdot v7.4s, v11.16b, v15.4b[1] \n" + " ld1 {v10.16b}, [%[ap]], #16 \n" + " ld1 {v11.16b}, [%[ap]], #16 \n" + " sdot v0.4s, v8.16b, v12.4b[2] \n" + " sdot v1.4s, v8.16b, v13.4b[2] \n" + " sdot v2.4s, v8.16b, v14.4b[2] \n" + " sdot v3.4s, v8.16b, v15.4b[2] \n" + " sdot v4.4s, v9.16b, v12.4b[2] \n" + " sdot v5.4s, v9.16b, v13.4b[2] \n" + " sdot v6.4s, v9.16b, v14.4b[2] \n" + " sdot v7.4s, v9.16b, v15.4b[2] \n" + " ld1 {v8.4s}, [%[sp]], #16 \n" + " ld1 {v9.4s}, [%[sp]], #16 \n" + " sdot v0.4s, v10.16b, v12.4b[3] \n" + " sdot v1.4s, v10.16b, v13.4b[3] \n" + " sdot v2.4s, v10.16b, v14.4b[3] \n" + " sdot v3.4s, v10.16b, v15.4b[3] \n" + " sdot v4.4s, v11.16b, v12.4b[3] \n" + " sdot v5.4s, v11.16b, v13.4b[3] \n" + " sdot v6.4s, v11.16b, v14.4b[3] \n" + " sdot v7.4s, v11.16b, v15.4b[3] \n" + " mla %[c00].4s, v0.4s, v8.4s \n" + " mla %[c10].4s, v1.4s, v8.4s \n" + " mla %[c20].4s, v2.4s, v8.4s \n" + " mla %[c30].4s, v3.4s, v8.4s \n" + " mla %[c01].4s, v4.4s, v9.4s \n" + " mla %[c11].4s, v5.4s, v9.4s \n" + " mla %[c21].4s, v6.4s, v9.4s \n" + " mla %[c31].4s, v7.4s, v9.4s \n" + " bne loop_%= \n" + " exit_%=:\n" + : [len] "+r" (length) + , [ap] "+r" (ap) + , [bp0] "+r" (b_ptr[0]) + , [bp1] "+r" (b_ptr[1]) + , [bp2] "+r" (b_ptr[2]) + , [bp3] "+r" (b_ptr[3]) + , [sp] "+r" (sp) + , [c00] "+w" (cci[0][0]) + , [c10] "+w" (cci[1][0]) + , [c20] "+w" (cci[2][0]) + , [c30] "+w" (cci[3][0]) + , [c01] "+w" (cci[0][1]) + , [c11] "+w" (cci[1][1]) + , [c21] "+w" (cci[2][1]) + , [c31] "+w" (cci[3][1]) + : + : "v0", "v1", "v2", "v3" + , "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11" + , "v12", "v13", "v14", "v15" + , "memory", "cc" + ); + a_ptr += BS * QK; + } else + { + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + cci[i][j] = vdupq_n_s32(0); + } + } + for (int k0 = 0; k0 < QK/SS; k0 ++) { + int32x4_t ccv[BN][BS/4]; + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + ccv[i][j] = vdupq_n_s32(0); + } + } + #pragma unroll + for (int k2 = 0; k2 < SS; k2 += 16) { + const int OFFSET = 256; + __builtin_prefetch((a_ptr + OFFSET + 0*64), 0, 0); + __builtin_prefetch((a_ptr + OFFSET + 1*64), 0, 0); + + int8x16_t bb[BN]; + int8x16_t aa[BS/4]; + for (int i = 0; i < BN; i ++) { + bb[i] = vld1q_s8(b_ptr[i]); b_ptr[i] += 16; + } + for (int k1 = 0; k1 < 4; k1 ++) { + for (int i = 0; i < BS/4; i ++) { + aa[i] = vld1q_s8(a_ptr); a_ptr += 16; + } + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + ccv[i][j] = vdotq_laneq_s32(ccv[i][j], aa[j], bb[i], k1); + } + } + } + } + int32x4_t scal[BS/4]; + for (int i = 0; i < BS/4; i ++) { + scal[i] = vld1q_s32(a->scales + s*a->k/SS + (k*QK/SS+k0)*BS + i*4); + } + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + cci[i][j] = vmlaq_s32(cci[i][j], ccv[i][j], scal[j]); + } + 
} + } + } + float32x4_t scalf[BS/4]; + for (int i = 0; i < BS/4; i ++) { + scalf[i] = vld1q_f32(a->ds + s*a->bk + k*BS + i*4); + } + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + cc[i][j] = vfmaq_f32(cc[i][j], vcvtq_f32_s32(cci[i][j]), vmulq_n_f32(scalf[j], y[i][k].d)); + } + } + } + if constexpr (a->NeedSum) { + const int16_t *a_ptr = a->scalems + s*a->k/SS; + const int16_t *b_ptr[BN]; + for (int k = 0; k < a->bk; k ++) { + for (int i = 0; i < BN; i ++) { + b_ptr[i] = y[i][k].bsums; + } + int32x4_t cci[BN][BS/4]; + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + cci[i][j] = vdupq_n_s32(0); + } + } + for (int k0 = 0; k0 < QK/SS/4; k0 ++) { + int16x8_t bb[BN]; + int16x8_t aa[BS/8]; + for (int i = 0; i < BN; i ++) { + bb[i] = vld1q_s16(b_ptr[i]); b_ptr[i] += 8; + } + for (int k1 = 0; k1 < 4; k1 ++) { + for (int i = 0; i < BS/8; i ++) { + aa[i] = vld1q_s16(a_ptr); a_ptr += 8; + } + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/8; j ++) { + cci[i][2*j+0] = vmlal_laneq_s16(cci[i][2*j+0], vget_low_s16(aa[j]), bb[i], 2*k1+0); + cci[i][2*j+1] = vmlal_high_laneq_s16(cci[i][2*j+1], aa[j], bb[i], 2*k1+0); + cci[i][2*j+0] = vmlal_laneq_s16(cci[i][2*j+0], vget_low_s16(aa[j]), bb[i], 2*k1+1); + cci[i][2*j+1] = vmlal_high_laneq_s16(cci[i][2*j+1], aa[j], bb[i], 2*k1+1); + } + } + } + } + float32x4_t scalf[BS/4]; + for (int i = 0; i < BS/4; i ++) { + scalf[i] = vld1q_f32(a->dmins + s*a->bk + k*BS + i*4); + } + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + cc[i][j] = vfmaq_f32(cc[i][j], vcvtq_f32_s32(cci[i][j]), vmulq_n_f32(scalf[j], y[i][k].d)); + } + } + } + } + for (int i = 0; i < BN; i ++) { + for (int j = 0; j < BS/4; j ++) { + vst1q_f32(info.ptr(j*4+s+idx, i), cc[i][j]); + } + } + } + return; +} + +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_T_v2(int m, int n, int k, const void * vx, size_t bx, const DataInfo& info) { + constexpr int m_step = 64; + constexpr int n_step = 4; + assert(m%m_step == 0); + int n2 = n - (n%n_step); + int left = n%n_step; + BlockQxK xx(m_step, k); + for (int i = 0; i < m; i += m_step) { + auto this_info = info; + int bm = (m - i) < m_step ? 
(m - i) : m_step; + xx.FromDequantizer(vx, bx, i, bm, k); + for (int j = 0; j < n2; j += n_step) { + Q8 q8(this_info); + matmul_v2_kernel, n_step>(&xx, q8.y, this_info, i, j); + this_info.cur_y += n_step; + } + if (left) { + switch (left) { + case 1: + { + Q8<1, block_q8_K> q8(this_info); + matmul_v2_kernel, 1>(&xx, q8.y, this_info, i, n2); + this_info.cur_y += 1; + break; + } + case 2: + { + Q8<2, block_q8_K> q8(this_info); + matmul_v2_kernel, 2>(&xx, q8.y, this_info, i, n2); + this_info.cur_y += 2; + break; + } + case 3: + { + Q8<3, block_q8_K> q8(this_info); + matmul_v2_kernel, 3>(&xx, q8.y, this_info, i, n2); + this_info.cur_y += 3; + break; + } + } + } + } + return; +} + +template +IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8, + const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) { + auto mzero = vdupq_n_s32(0); + const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val; + const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val; + + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1 + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2 + auto p12 = vpaddq_s32(p1, p2); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3 + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4 + auto p34 = vpaddq_s32(p3, p4); + + auto pall = vpaddq_s32(p12, p34); + sumi = vmlaq_s32(sumi, scales.val[j], pall); +} + +template +IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8, + const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) { + auto mzero = vdupq_n_s32(0); + + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1 + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2 + auto p12 = vpaddq_s32(p1, p2); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3 + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4 + auto p34 = vpaddq_s32(p3, p4); + + auto pall = vpaddq_s32(p12, p34); + sumi = vmlaq_s32(sumi, scales, pall); +} + +template +IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8, + const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) { + + auto mzero = vdupq_n_s32(0); + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1, + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4, + auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3 + sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12); + + auto q8b_3 = q8.load_quants(iy, 
i, 4*j+2); + auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5, + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7, + auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7 + sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34); +} + +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx, nrc_y); + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + float32x4_t acc[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + + for (int i = 0; i < nb; ++i) { + + int32x4_t sumi[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); + + if constexpr (Dequantizer::num_blocks() == 8) { + auto scales = deq.new_block(i); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else if constexpr (Dequantizer::num_blocks() == 16) { + auto scales = deq.new_block(i); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else { + GGML_ASSERT(false); + } +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); + } + } +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} + +template +inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8s = q8.load_bsums8(iy, i); + int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s)); + int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s)); + float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2)); + acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i))); + } +} + +template +inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8s = q8.load_bsums(iy, i); + int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0])); + int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0])); + int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1])); + int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1])); + float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4))); + acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i))); + } +} + +struct Q2bits { + const uint8x16_t m4b = vdupq_n_u8(0x03); + uint8x16x4_t b1, b2; + inline void prepare(const uint8_t * qs) { + auto q2bits = vld1q_u8_x2(qs); + b1.val[0] = vandq_u8(q2bits.val[0], m4b); + b1.val[1] = 
vandq_u8(q2bits.val[1], m4b); + + q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); + q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); + b1.val[2] = vandq_u8(q2bits.val[0], m4b); + b1.val[3] = vandq_u8(q2bits.val[1], m4b); + + q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); + q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); + b2.val[0] = vandq_u8(q2bits.val[0], m4b); + b2.val[1] = vandq_u8(q2bits.val[1], m4b); + + q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); + q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); + b2.val[2] = vandq_u8(q2bits.val[0], m4b); + b2.val[3] = vandq_u8(q2bits.val[1], m4b); + } +}; + +struct HighBit5 { + const uint8x16_t mhb = vdupq_n_u8(0x10); + uint8x16x2_t bits; + inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) { + b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb)); + b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb)); + b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb)); + b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb)); + + b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb)); + b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb)); + b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb)); + b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb)); + + if (do_shift) { + bits.val[0] = vshrq_n_u8(bits.val[0], 4); + bits.val[1] = vshrq_n_u8(bits.val[1], 4); + } + } +}; + +struct HighBit3 { + const uint8x16_t mhb = vdupq_n_u8(0x04); + uint8x16x2_t bits; + inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) { + b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb)); + b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb)); + b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb)); + b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb)); + + b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb)); + b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb)); + b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb)); + b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb)); + + if (do_shift) { + bits.val[0] = vshrq_n_u8(bits.val[0], 4); + bits.val[1] = vshrq_n_u8(bits.val[1], 4); + } + } +}; + +struct DequantizerQ5K final : public BaseDequantizer { + DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + h.bits = vld1q_u8_x2(x[i].qh); + return s8.process_scales_mins(x[i], q8, i, acc); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+64*j); + h.apply(bits.b1, bits.b2, j == 0); + } + + Q4bits bits; + HighBit5 h; + Scales8 s8; + + uint8x16x2_t hbits; + + float d; +}; + +inline int32x4x4_t make_wider(const int16x8x2_t& scales16) { + int32x4x4_t scales = { + vmovl_s16(vget_low_s16 (scales16.val[0])), + vmovl_s16(vget_high_s16(scales16.val[0])), + vmovl_s16(vget_low_s16 (scales16.val[1])), + vmovl_s16(vget_high_s16(scales16.val[1])), + }; + return scales; +} + +template +inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) { + int16x8x2_t scales16; + scales16.val[0] = vmovl_s8(vget_low_s8(scales8)); + 
scales16.val[1] = vmovl_s8(vget_high_s8(scales8)); + accum_mins_16(scales16, q8, acc, i, c); + return make_wider(scales16); +} + +struct DequantizerQ3K final : public BaseDequantizer { + DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + h.bits = vld1q_u8_x2(x[i].hmask); + const uint16_t * sc16 = (const uint16_t *)x[i].scales; + uint32_t aux0 = sc16[0] | (sc16[1] << 16); + uint32_t aux1 = sc16[2] | (sc16[3] << 16); + uint32_t aux2 = sc16[4] | (sc16[5] << 16); + aux32[0] = (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030); + aux32[1] = (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030); + aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030); + aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030); + return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d); + } + + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+32*j); + h.apply(bits.b1, bits.b2, j == 0); + } + + uint32_t aux32[4]; + + Q2bits bits; + + HighBit3 h; + + float d; +}; + +struct DequantizerQ2K final : public BaseDequantizer { + DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return true; } + + template + inline void process_scales(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + auto scales_and_mins = vld1q_u8(x[i].scales); + auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4)); + int16x8x2_t scales16; + scales16.val[0] = vmovl_s8(vget_low_s8(mins8)); + scales16.val[1] = vmovl_s8(vget_high_s8(mins8)); + accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin)); + + scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf)); + } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + process_scales(i, q8, acc); + int16x8x2_t scales16; + scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8))); + scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8))); + return make_wider(scales16); + } + + template + inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) { + auto m1 = vdupq_n_u8(1); + auto shuffle = vdupq_n_u8(8*j); + bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], 
vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]), + vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]); + + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]), + vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]), + vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]); + + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]), + vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]); + } + } + + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+32*j); + } + + uint32_t aux32[4]; + + uint8x16_t scales8; + + Q2bits bits; + + float d; +}; + +IQK_ALWAYS_INLINE void fusion_mul_mat_qX_K_q8_K_T_y1_d6k( + float32x4_t &acc, + const uint8_t *x_ql, // [128] 4bit + const uint8_t *x_qh, // [64] 2bit + const int8_t *x_scale, // [16] 8bit + float x_d, + const int8_t *y_qs, // [256] 8bit + const int16_t *y_bsums, // [16] 16bit + float y_d) +{ + float c0 = x_d * y_d; + float c1 = -32.0f * c0; + const int OFFSET = 1024; + __builtin_prefetch((x_ql + OFFSET + 0*64), 0, 0); + __builtin_prefetch((x_ql + OFFSET + 1*64), 0, 0); + __builtin_prefetch((x_ql + OFFSET + 2*64), 0, 0); + + int16x8_t scale16_0, scale16_1; + { + int8x16_t tmp = vld1q_s8(x_scale); + scale16_0 = vmovl_s8(vget_low_s8(tmp)); + scale16_1 = vmovl_high_s8(tmp); + } + { + int16x8_t q8s0 = vld1q_s16(y_bsums + 0); + int16x8_t q8s1 = vld1q_s16(y_bsums + 8); + int32x4_t b0 = vmull_s16(vget_low_s16(scale16_0), vget_low_s16(q8s0)); + b0 = vmlal_high_s16(b0, scale16_0, q8s0); + b0 = vmlal_s16(b0, vget_low_s16(scale16_1), vget_low_s16(q8s1)); + b0 = vmlal_high_s16(b0, scale16_1, q8s1); + acc = vfmaq_n_f32(acc, vcvtq_f32_s32(b0), c1); + } + uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7; + int32x4_t sumi = vdupq_n_s32(0); + { + const uint8x16_t m0 = vdupq_n_u8(0x3f); + const uint8x16_t m1 = vdupq_n_u8(0x30); + const uint8x16_t m2 = vdupq_n_u8(0x0f); + x0 = vld1q_u8(x_ql + 0*16 + 0*64); + x1 = vld1q_u8(x_ql + 1*16 + 0*64); + x2 = vld1q_u8(x_ql + 2*16 + 0*64); + x3 = vld1q_u8(x_ql + 3*16 + 0*64); + uint8x16_t hbits0 = vld1q_u8(x_qh + 0*16 + 0*32); + uint8x16_t hbits1 = vld1q_u8(x_qh + 1*16 + 0*32); + x4 = vandq_u8(hbits0, m0); + x4 = vsriq_n_u8(x4, x0, 4); + x5 = vandq_u8(hbits1, m0); + x5 = vsriq_n_u8(x5, x1, 4); + x6 = vshrq_n_u8(hbits0, 2); + x6 = vsriq_n_u8(x6, x2, 4); + x7 = vshrq_n_u8(hbits1, 2); + x7 = vsriq_n_u8(x7, x3, 4); + x0 = vsliq_n_u8(x0, hbits0, 4); + x0 = vandq_u8(x0, m0); + x1 = vsliq_n_u8(x1, hbits1, 4); + x1 = vandq_u8(x1, m0); + hbits0 = vshlq_n_u8(hbits0, 2); + hbits0 = vandq_u8(hbits0, m1); + x2 = vandq_u8(x2, m2); + x2 = vorrq_u8(x2, hbits0); + hbits1 = vshlq_n_u8(hbits1, 2); + hbits1 = vandq_u8(hbits1, m1); + x3 = vandq_u8(x3, m2); + x3 = vorrq_u8(x3, hbits1); + } + { + int8x16_t base = vdupq_n_s8(32); + int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 0*128); + int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 0*128); + int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 0*128); + int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 0*128); + int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 0*128); + int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 0*128); + int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 0*128); + int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 0*128); + int32x4_t p00 = vdupq_n_s32(0); + int32x4_t p01 = vdupq_n_s32(0); + int32x4_t p10 = vdupq_n_s32(0); + int32x4_t p11 = 
vdupq_n_s32(0); + int32x4_t p20 = vdupq_n_s32(0); + int32x4_t p21 = vdupq_n_s32(0); + int32x4_t p30 = vdupq_n_s32(0); + int32x4_t p31 = vdupq_n_s32(0); + p00 = vdotq_s32(p00, vreinterpretq_s8_u8(x0), y0); + p01 = vdotq_s32(p01, vreinterpretq_s8_u8(x1), y1); + p10 = vdotq_s32(p10, vreinterpretq_s8_u8(x2), y2); + p11 = vdotq_s32(p11, vreinterpretq_s8_u8(x3), y3); + p20 = vdotq_s32(p20, vreinterpretq_s8_u8(x4), y4); + p21 = vdotq_s32(p21, vreinterpretq_s8_u8(x5), y5); + p30 = vdotq_s32(p30, vreinterpretq_s8_u8(x6), y6); + p31 = vdotq_s32(p31, vreinterpretq_s8_u8(x7), y7); + p00 = vpaddq_s32(p00, p01); + p10 = vpaddq_s32(p10, p11); + p20 = vpaddq_s32(p20, p21); + p30 = vpaddq_s32(p30, p31); + p00 = vpaddq_s32(p00, p10); + p20 = vpaddq_s32(p20, p30); + sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale16_0)), p00); + sumi = vmlaq_s32(sumi, vmovl_high_s16(scale16_0), p20); + } + { + const uint8x16_t m0 = vdupq_n_u8(0x3f); + const uint8x16_t m1 = vdupq_n_u8(0x30); + const uint8x16_t m2 = vdupq_n_u8(0x0f); + x0 = vld1q_u8(x_ql + 0*16 + 1*64); + x1 = vld1q_u8(x_ql + 1*16 + 1*64); + x2 = vld1q_u8(x_ql + 2*16 + 1*64); + x3 = vld1q_u8(x_ql + 3*16 + 1*64); + uint8x16_t hbits0 = vld1q_u8(x_qh + 0*16 + 1*32); + uint8x16_t hbits1 = vld1q_u8(x_qh + 1*16 + 1*32); + x4 = vandq_u8(hbits0, m0); + x4 = vsriq_n_u8(x4, x0, 4); + x5 = vandq_u8(hbits1, m0); + x5 = vsriq_n_u8(x5, x1, 4); + x6 = vshrq_n_u8(hbits0, 2); + x6 = vsriq_n_u8(x6, x2, 4); + x7 = vshrq_n_u8(hbits1, 2); + x7 = vsriq_n_u8(x7, x3, 4); + x0 = vsliq_n_u8(x0, hbits0, 4); + x0 = vandq_u8(x0, m0); + x1 = vsliq_n_u8(x1, hbits1, 4); + x1 = vandq_u8(x1, m0); + hbits0 = vshlq_n_u8(hbits0, 2); + hbits0 = vandq_u8(hbits0, m1); + x2 = vandq_u8(x2, m2); + x2 = vorrq_u8(x2, hbits0); + hbits1 = vshlq_n_u8(hbits1, 2); + hbits1 = vandq_u8(hbits1, m1); + x3 = vandq_u8(x3, m2); + x3 = vorrq_u8(x3, hbits1); + } + { + int8x16_t base = vdupq_n_s8(32); + int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 1*128); + int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 1*128); + int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 1*128); + int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 1*128); + int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 1*128); + int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 1*128); + int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 1*128); + int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 1*128); + int32x4_t p00 = vdupq_n_s32(0); + int32x4_t p01 = vdupq_n_s32(0); + int32x4_t p10 = vdupq_n_s32(0); + int32x4_t p11 = vdupq_n_s32(0); + int32x4_t p20 = vdupq_n_s32(0); + int32x4_t p21 = vdupq_n_s32(0); + int32x4_t p30 = vdupq_n_s32(0); + int32x4_t p31 = vdupq_n_s32(0); + p00 = vdotq_s32(p00, vreinterpretq_s8_u8(x0), y0); + p01 = vdotq_s32(p01, vreinterpretq_s8_u8(x1), y1); + p10 = vdotq_s32(p10, vreinterpretq_s8_u8(x2), y2); + p11 = vdotq_s32(p11, vreinterpretq_s8_u8(x3), y3); + p20 = vdotq_s32(p20, vreinterpretq_s8_u8(x4), y4); + p21 = vdotq_s32(p21, vreinterpretq_s8_u8(x5), y5); + p30 = vdotq_s32(p30, vreinterpretq_s8_u8(x6), y6); + p31 = vdotq_s32(p31, vreinterpretq_s8_u8(x7), y7); + p00 = vpaddq_s32(p00, p01); + p10 = vpaddq_s32(p10, p11); + p20 = vpaddq_s32(p20, p21); + p30 = vpaddq_s32(p30, p31); + p00 = vpaddq_s32(p00, p10); + p20 = vpaddq_s32(p20, p30); + sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale16_1)), p00); + sumi = vmlaq_s32(sumi, vmovl_high_s16(scale16_1), p20); + } + { + acc = vfmaq_n_f32(acc, vcvtq_f32_s32(sumi), c0); + } + return; +} + +IQK_ALWAYS_INLINE void fusion_mul_mat_qX_K_q8_K_T_y1_d4k( + float32x4_t &acc, + const uint8_t *x_scale, + const uint8_t *x_qs, + float x_d, + float x_dmin, + const int8_t *y_qs, + 
const int16_t *y_bsums, + float y_d) +{ + float c0 = x_d * y_d; + float c1 = -x_dmin * y_d; + const int OFFSET = 1024; + __builtin_prefetch((x_scale + OFFSET + 0*64), 0, 0); + __builtin_prefetch((x_scale + OFFSET + 1*64), 0, 0); + + int16x8_t scale_min; + int16x8_t scale; + { + uint32_t utmp[4]; + const uint8_t * sc8 = (const uint8_t *)utmp; + make_q4_scales(x_scale, utmp); + int8x16_t ss = vld1q_s8((const int8_t *)sc8); + scale = vmovl_s8(vget_low_s8(ss)); + scale_min = vmovl_high_s8(ss); + } + { + int16x8_t q8s0 = vld1q_s16(y_bsums + 0); + int16x8_t q8s1 = vld1q_s16(y_bsums + 8); + q8s0 = vpaddq_s16(q8s0, q8s1); + int32x4_t b0 = vmull_s16(vget_low_s16(scale_min), vget_low_s16(q8s0)); + b0 = vmlal_high_s16(b0, scale_min, q8s0); + acc = vfmaq_n_f32(acc, vcvtq_f32_s32(b0), c1); + } + int32x4_t sumi = vdupq_n_s32(0); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + uint8x16_t x0, x1, x2, x3, x4, x5, x6, x7; + { + x0 = vld1q_u8(x_qs + 0*16 + 0*64); + x1 = vld1q_u8(x_qs + 1*16 + 0*64); + x4 = vld1q_u8(x_qs + 2*16 + 0*64); + x5 = vld1q_u8(x_qs + 3*16 + 0*64); + x2 = vshrq_n_u8(x0, 4); + x3 = vshrq_n_u8(x1, 4); + x6 = vshrq_n_u8(x4, 4); + x7 = vshrq_n_u8(x5, 4); + x0 = vandq_u8(x0, m4b); + x1 = vandq_u8(x1, m4b); + x4 = vandq_u8(x4, m4b); + x5 = vandq_u8(x5, m4b); + } + { + int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 0*128); + int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 0*128); + int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 0*128); + int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 0*128); + int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 0*128); + int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 0*128); + int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 0*128); + int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 0*128); + int32x4_t p0 = vdupq_n_s32(0); + int32x4_t p1 = vdupq_n_s32(0); + int32x4_t p2 = vdupq_n_s32(0); + int32x4_t p3 = vdupq_n_s32(0); + p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x0), y0); + p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x2), y2); + p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x4), y4); + p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x6), y6); + p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x1), y1); + p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x3), y3); + p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x5), y5); + p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x7), y7); + p0 = vpaddq_s32(p0, p1); + p2 = vpaddq_s32(p2, p3); + p0 = vpaddq_s32(p0, p2); + sumi = vmlaq_s32(sumi, vmovl_s16(vget_low_s16(scale)), p0); + } + { + x0 = vld1q_u8(x_qs + 0*16 + 1*64); + x1 = vld1q_u8(x_qs + 1*16 + 1*64); + x4 = vld1q_u8(x_qs + 2*16 + 1*64); + x5 = vld1q_u8(x_qs + 3*16 + 1*64); + x2 = vshrq_n_u8(x0, 4); + x3 = vshrq_n_u8(x1, 4); + x6 = vshrq_n_u8(x4, 4); + x7 = vshrq_n_u8(x5, 4); + x0 = vandq_u8(x0, m4b); + x1 = vandq_u8(x1, m4b); + x4 = vandq_u8(x4, m4b); + x5 = vandq_u8(x5, m4b); + } + { + int8x16_t y0 = vld1q_s8(y_qs + 0*16 + 1*128); + int8x16_t y1 = vld1q_s8(y_qs + 1*16 + 1*128); + int8x16_t y2 = vld1q_s8(y_qs + 2*16 + 1*128); + int8x16_t y3 = vld1q_s8(y_qs + 3*16 + 1*128); + int8x16_t y4 = vld1q_s8(y_qs + 4*16 + 1*128); + int8x16_t y5 = vld1q_s8(y_qs + 5*16 + 1*128); + int8x16_t y6 = vld1q_s8(y_qs + 6*16 + 1*128); + int8x16_t y7 = vld1q_s8(y_qs + 7*16 + 1*128); + int32x4_t p0 = vdupq_n_s32(0); + int32x4_t p1 = vdupq_n_s32(0); + int32x4_t p2 = vdupq_n_s32(0); + int32x4_t p3 = vdupq_n_s32(0); + p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x0), y0); + p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x2), y2); + p2 = vdotq_s32(p2, vreinterpretq_s8_u8(x4), y4); + p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x6), y6); + p0 = vdotq_s32(p0, vreinterpretq_s8_u8(x1), y1); + p1 = vdotq_s32(p1, vreinterpretq_s8_u8(x3), y3); + p2 = 
vdotq_s32(p2, vreinterpretq_s8_u8(x5), y5); + p3 = vdotq_s32(p3, vreinterpretq_s8_u8(x7), y7); + p0 = vpaddq_s32(p0, p1); + p2 = vpaddq_s32(p2, p3); + p0 = vpaddq_s32(p0, p2); + sumi = vmlaq_s32(sumi, vmovl_high_s16(scale), p0); + } + { + acc = vfmaq_n_f32(acc, vcvtq_f32_s32(sumi), c0); + } +} + +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx, nrc_y); + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + float32x4_t acc[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + +//#pragma GCC unroll 4 + for (int i = 0; i < nb; ++i) { +#ifdef GEMV_Q4K + if constexpr (nrc_y == 1 && std::is_same::value) { + fusion_mul_mat_qX_K_q8_K_T_y1_d6k( + acc[0], + deq.x[i].ql, + deq.x[i].qh, + deq.x[i].scales, + GGML_FP16_TO_FP32(deq.x[i].d), + q8.y[0][i].qs, + q8.y[0][i].bsums, + q8.y[0][i].d); + } else +#endif +#ifdef GEMV_Q6K + if constexpr (nrc_y == 1 && std::is_same::value) { + fusion_mul_mat_qX_K_q8_K_T_y1_d4k( + acc[0], + deq.x[i].scales, + deq.x[i].qs, + GGML_FP16_TO_FP32(deq.x[i].d), + GGML_FP16_TO_FP32(deq.x[i].dmin), + q8.y[0][i].qs, + q8.y[0][i].bsums, + q8.y[0][i].d); + } else +#endif + { + int32x4_t sumi[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); + + if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) { + deq.process_scales(i, q8, acc); + deq.prepare(i, 0); + deq.compute(q8, i, 0, sumi); + deq.prepare(i, 1); + deq.compute(q8, i, 1, sumi); + } else { + if constexpr (Dequantizer::num_blocks() == 8) { + auto scales = deq.new_block(i, q8, acc); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else if constexpr (Dequantizer::num_blocks() == 16) { + auto scales = deq.new_block(i, q8, acc); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else { + GGML_ASSERT(false); + } + } + +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); + } + } + +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } + } +} + +// ============================= i-quants + +struct DequantizerIQ4XS final : public BaseDequantizer { + + static int8x16_t load_values() { + static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + return vld1q_s8(iq4nl_values); + } + + DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); } + + template + inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { + (void)q8; + (void)acc; + d = GGML_FP16_TO_FP32(x[i].d); + const uint16_t scales_h = 
x[i].scales_h; + const uint16_t * scales_l = (const uint16_t *)x[i].scales_l; + aux32[0] = scales_l[0] | (scales_l[1] << 16); + aux32[1] = aux32[0] >> 4; + // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7 + uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf)); + uint16_t * aux16 = (uint16_t *)aux32; + aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2; + // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7 + uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30)); + int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32)); + // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7 + scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff)); + int16x8_t scales16 = vmovl_s8(scales8); + int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))}; + return scales; + } + inline void prepare(int i, int j) { + bits.prepare16(x[i].qs+64*j); + for (int k = 0; k < 4; ++k) { + bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k])); + bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k])); + } + } + + Q4bits bits; + const int8x16_t values; + uint32_t aux32[2]; + + constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602}; + + float d; +}; + +struct SimpleBits { + uint8x16x4_t b1; + uint8x16x4_t b2; +}; + +IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) { + int32x4x2_t scales; + auto one = vdupq_n_u32(1); + scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1)); + scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1)); + return scales; +} + +inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) { + auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127)))); + auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127)))); + b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1)); + b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2)); +} + +IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) { + return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1)); +} + +struct DequantizerIQ2XXS final : public BaseDequantizer { + DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); } + + inline int32x4_t unpack(int i, int j, uint8x16_t * q) const { + auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j)); + prepare_all(data, q); + return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1])); + } + +private: + + static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) { + const uint8_t * idx = (const uint8_t *)bits; + b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]}); + b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]}); + apply_signs_2(b, signs, bits[1]); + } + + inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) { + const uint32_t * q2 = (const uint32_t *)data.val; + prepare2(quants+0, q2+0, keven_signs); + prepare2(quants+2, q2+2, keven_signs); + prepare2(quants+4, q2+4, keven_signs); + prepare2(quants+6, q2+6, 
keven_signs); + } +}; + +inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) { + auto aux = vld1_u8(sc); + auto scales_l = vand_u8(aux, vdup_n_u8(0xf)); + auto scales_h = vshr_n_u8(aux, 4); + auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + + auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1))); + int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) }; + return make_wider(scales16); +} + +struct DequantizerIQ2XS final : public BaseDequantizer { + DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + SimpleBits bits; + float d; + + inline int32x4x4_t new_block(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + prepare_internal(i, 0); + return prepare_4bit_scales16(x[i].scales); + } + + inline void prepare(int i, int j) { + if (j == 1) prepare_internal(i, 1); + } + +private: + + static void make2(const uint16_t * qs, uint8x16_t * b) { + auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511)))); + auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511)))); + auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9)))); + auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9)))); + b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1)); + b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2)); + } + + inline static void make4(const uint16_t * qs, uint8x16_t * b) { + make2(qs + 0, b + 0); + make2(qs + 4, b + 2); + } + + IQK_ALWAYS_INLINE void prepare_internal(int i, int j) { + make4(x[i].qs + 16*j + 0, bits.b1.val); + make4(x[i].qs + 16*j + 8, bits.b2.val); + } + +}; + +// So, I hate to include this table, but with the GCC 12.3 compiler +// bundled in the Cosmopolitan tools, loading the unpacked sign bytes +// from this table using the packed 8 sign bits as index is faster than +// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to +// expand the bits to bytes. 
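// A minimal sketch of how each kall_signs entry below encodes its 8-bit index
// (kall_signs_entry_sketch is a hypothetical helper added purely for
// illustration; it is not used by the kernels). Bit k of the index selects
// byte k of the 64-bit entry: 0xff (-1 as int8) when the packed sign bit is
// set, 0x01 otherwise, so a single byte-indexed load yields 8 already-unpacked
// sign multipliers.
static inline uint64_t kall_signs_entry_sketch(uint8_t packed) {
    uint64_t entry = 0;
    for (int k = 0; k < 8; ++k) {
        entry |= ((packed >> k) & 1 ? 0xffull : 0x01ull) << (8*k);
    }
    return entry; // e.g. kall_signs_entry_sketch(0x00) == 0x0101010101010101
}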
+static const uint64_t kall_signs[256] = { + 0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff, + 0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff, + 0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff, + 0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff, + 0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff, + 0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff, + 0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff, + 0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff, + 0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff, + 0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff, + 0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff, + 0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff, + 0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff, + 0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff, + 0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff, + 0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff, + 0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff, + 0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff, + 0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff, + 0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff, + 0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff, + 0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff, + 0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff, + 0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff, + 0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff, + 0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff, + 0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff, + 0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff, + 0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff, + 0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff, + 0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff, + 0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff, + 0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff, + 0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff, + 0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff, + 0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff, + 0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff, + 0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff, + 0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff, + 0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff, + 0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff, + 0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff, + 0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 
0xff01ff01ff01ffff, + 0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff, + 0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff, + 0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff, + 0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff, + 0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff, + 0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff, + 0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff, + 0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff, + 0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff, + 0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff, + 0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff, + 0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff, + 0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff, + 0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff, + 0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff, + 0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff, + 0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff, + 0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff, + 0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff, + 0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff, + 0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff, +}; + +struct SignHelper { + + IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const { + auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]}); + // Normally we would expect this to be faster, but it isn't. + // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1])); + // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1)); + b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s)); + } + + // We would need these two if we weren't loading from the unpacked sign table. 
+ //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201)); + //const uint8x16_t m1 = vdupq_n_u8(1); +}; + +struct DequantizerIQ2S final : public BaseDequantizer { + DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + SimpleBits bits; + float d; + + inline int32x4x4_t new_block(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + prepare_internal(i, 0, bits); + return prepare_4bit_scales16(x[i].scales); + } + + inline void prepare(int i, int j) { + if (j == 1) prepare_internal(i, 1, bits); + } + +private: + + static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) { + uint32_t aux32[2]; + const uint16_t * aux16 = (const uint16_t *)aux32; + for (int k = 0; k < 2; ++k) { + aux32[1] = (qh[k] << 4) | (qh[k] << 18); + aux32[0] = (aux32[1] << 4) & 0x03000300; + aux32[1] &= 0x03000300; + b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))), + vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1])))); + b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))), + vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3])))); + sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2; + sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2; + } + } + + void prepare_internal(int i, int j, SimpleBits& sb) { + + const auto * qs = x[i].qs + 16*j; + const auto * qh = x[i].qh + 4*j; + const auto * sign_bits = qs + QK_K/8; + + make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val); + make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val); + } + + SignHelper sh; +}; + +struct DequantizerIQ3XXS final : public BaseDequantizer { + DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); } + + inline int32x4_t unpack(int i, int j, uint8x16_t * q) const { + auto q3data = vld1q_u8_x2(x[i].qs + 32*j); + auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j)); + prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q); + return prepare_scales_8(gas); + } + +private: + + inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) { + b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]}); + b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]}); + apply_signs_2(b, keven_signs, sidx); + } + inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) { + make2(q3+ 0, signs[0], quants + 0); + make2(q3+ 8, signs[1], quants + 2); + make2(q3+16, signs[2], quants + 4); + make2(q3+24, signs[3], quants + 6); + } +}; + +struct DequantizerIQ3S final : public BaseDequantizer { + DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + SimpleBits bits; + float d; + + inline int32x4x2_t new_block(int i) { + d = GGML_FP16_TO_FP32(x[i].d); + uint32_t scales32[2]; + auto qs = vld1q_u8_x2(x[i].qs); + auto signs = vld1q_u8(x[i].signs); + + prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs); + + std::memcpy(scales32, x[i].scales, 4); + 
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; + scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; + auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7 + scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400))); + auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8)); + int32x4x2_t scales; + scales.val[0] = vmovl_s16(vget_low_s16(scales16)); + scales.val[1] = vmovl_s16(vget_high_s16(scales16)); + return scales; + } + + inline void prepare(int i, int j) { + if (j == 1) { + auto qs = vld1q_u8_x2(x[i].qs + 32); + auto signs = vld1q_u8(x[i].signs + 16); + prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs); + } + } + +private: + + static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh, + const int16x8_t& hshift, uint8x16_t * b) { + auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256))); + const uint16_t * idx = (const uint16_t *)&vindex; + b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]}); + sh.apply_signs_1x(b+0, sign_bits+0); + b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]}); + sh.apply_signs_1x(b+1, sign_bits+2); + } + static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, + const int16x8_t& hshift, uint8x16_t * b) { + auto idx_l = vld1q_u8(qs); + make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0); + make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2); + } + + static int16x8_t load_shift() { + static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + return vld1q_s16(k_shift); + } + + inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) { + auto signs = vld1q_u8(sign_bits); + auto s = (const uint8_t *)&signs; + make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val); + make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val); + } + + SignHelper sh; + const int16x8_t hshift = load_shift(); + +}; + +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + Dequantizer deq(vx, bx, nrc_y); + uint8x16_t qx[8]; + int32x4_t sumi[nrc_y]; + float32x4_t acc[nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + + for (int i = 0; i < nb; ++i) { + float d = deq.new_block(i); + auto scales = deq.unpack(i, 0, qx); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + sumi[iy] = vdupq_n_s32(0); + compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]); + } + scales = deq.unpack(i, 1, qx); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]); + acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy])); + } + } +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} + +// =========================================== Legacy quants + +template +inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) { + for (int k = 0; k < 4; ++k) aux[k] = x[k].d; + return vld1_f16((const float16_t *)aux); +} + +template +inline float16x8_t 
load_scales_q1(const Block * x, ggml_half * aux) { + if constexpr (std::is_same_v) { + for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; } + } else { + for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; } + } + return vld1q_f16((const float16_t *)aux); +} + +struct Q4LegacyBits { + template + inline void prepare(const Block * x) { + for (int i = 0; i < 4; ++i) { + auto q4bits = vld1q_u8(x[i].qs); + b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b)); + b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4)); + } + } + inline void prepare1(const uint8_t * qs, int8x16_t * q) const { + auto q4bits = vld1q_u8(qs); + q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b)); + q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4)); + } + inline void prepare1(const uint8_t * qs) { + prepare1(qs, b); + } + const uint8x16_t m4b = vdupq_n_u8(0xf); + int8x16_t b[8]; +}; + +// One would think this commented out version would do better than the one below +// because it offers more opportunities to execute instructions in parallel. +// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers +// cannot it just do the sequential version below on its own? +//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) { +// const auto q8b_1 = vld1q_s8_x2(qs + 0); +// auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]); +// const auto q8b_2 = vld1q_s8_x2(qs + 32); +// auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]); +// auto p1234 = vpaddq_s32(p12, p34); +// const auto q8b_3 = vld1q_s8_x2(qs + 64); +// auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]); +// const auto q8b_4 = vld1q_s8_x2(qs + 96); +// auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]); +// return vpaddq_s32(p1234, vpaddq_s32(p56, p78)); +//} + +inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) { + auto q8b = vld1q_s8_x2(qs + 0); + auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]); + q8b = vld1q_s8_x2(qs + 32); + auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]); + auto p1234 = vpaddq_s32(p12, p34); + q8b = vld1q_s8_x2(qs + 64); + auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]); + q8b = vld1q_s8_x2(qs + 96); + auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]); + return vpaddq_s32(p1234, vpaddq_s32(p56, p78)); +} + +typedef struct { + ggml_half d[4]; + int8_t qs[4*QK8_0]; +} block_q8_0_x4; +static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding"); + +template struct Q80 { + + constexpr static int nrc_y = nrc; + + Q80(const DataInfo& info) { + for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy); + } + + inline const int8_t * quant_data(int iy, int i) const { + const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; + return y4->qs; + } + + inline float16x4_t load_scales(int iy, int i) const { + const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; + return vld1_f16((const float16_t *)y4->d); + } + + template + inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const { + auto qx_scales = deq.new_block(i); + for (int iy = 0; iy < nrc; ++iy) { + auto q8_scales = load_scales(iy, 
i); + sc16[iy] = vmul_f16(qx_scales, q8_scales); + } + } + + template + inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const { + deq.prepare1(i); + float d = GGML_FP16_TO_FP32(deq.x[i].d); + for (int iy = 0; iy < nrc; ++iy) { + auto q8b = vld1q_s8_x2(y[iy][i].qs); + auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]); + acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p)); + } + } + + const block_q8_0 * y[nrc_y]; +}; + +typedef struct { + ggml_half d[8]; + int8_t qs[4*QK8_1]; +} block_q8_1_x4; +static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding"); + +template struct Q81 { + + constexpr static int nrc_y = nrc; + + Q81(const DataInfo& info) { + for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy); + } + + inline const int8_t * quant_data(int iy, int i) const { + const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; + return y4->qs; + } + + inline float16x8_t load_scales(int iy, int i) const { + const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; + return vld1q_f16((const float16_t *)y4->d); + } + + template + inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const { + auto qx_scales = deq.new_block(i); + for (int iy = 0; iy < nrc; ++iy) { + auto q8_scales = load_scales(iy, i); + auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales)); + acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m)); + sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales)); + } + } + + template + inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const { + deq.prepare1(i); + float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m); + for (int iy = 0; iy < nrc; ++iy) { + auto q8b = vld1q_s8_x2(y[iy][i].qs); + auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]); + acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p)); + acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s))); + } + } + + const block_q8_1 * y[nrc_y]; +}; + +template +struct BaseLegacyDequantizer { + + BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {} + + inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); } + + Q4LegacyBits bits; + + const void * vx; + const block_q * x; + size_t bx; +}; + +struct DequantizerQ40 final : public BaseLegacyDequantizer { + + DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i, int8x16_t * q) const { + bits.prepare1(x[i].qs, q); + q[0] = vaddq_s8(q[0], m8); + q[1] = vaddq_s8(q[1], m8); + } + inline void prepare1(int i) { + prepare1(i, bits.b); + } + + inline float16x4_t new_block(int i) { + ggml_half aux[4]; + for (int k = 0; k < 4; ++k) { + aux[k] = x[4*i+k].d; + prepare1(4*i+k, bits.b + 2*k); + } + return vld1_f16((const float16_t *)aux); + } + + const int8x16_t m8 = vdupq_n_s8(-8); + //ggml_half aux[4]; +}; + +struct DequantizerQ41 : public BaseLegacyDequantizer { + + DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i) { + bits.prepare1(x[i].qs); + } + + inline float16x8_t new_block(int i) { + uint32_t aux32[4]; + const uint32_t * s32 = (const uint32_t *)&x[4*i].d; + for (int k = 0; k < 4; ++k) { + aux32[k] = *s32; s32 += 
sizeof(block_q4_1)/4; + bits.prepare1(x[4*i+k].qs, bits.b + 2*k); + } + return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle))); + } + // Leaving this commented out attempt to be reminded that I already tried this. + // It has basically the same performance as the version above. + //inline float16x8_t new_block(int i) { + // uint32x4_t scales = {}; + // const block_q4_1 * xi = x + 4*i; + // const uint32_t * s32 = (const uint32_t *)&xi->d; + // scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4; + // bits.prepare1(xi[0].qs, bits.b + 0); + // scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4; + // bits.prepare1(xi[1].qs, bits.b + 2); + // scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4; + // bits.prepare1(xi[2].qs, bits.b + 4); + // scales = vsetq_lane_u32(*s32, scales, 3); + // bits.prepare1(xi[3].qs, bits.b + 6); + // return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle))); + //} + + const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302}; +}; + +struct HighBit5Legacy { + inline uint8x16_t to_bytes(const uint8_t * qh) const { + uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle); + return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask)); + } + inline uint8x16_t to_negated_bytes(const uint8_t * qh) const { + uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle); + return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0)); + } + const uint64x2_t mask = vdupq_n_u64(0x8040201008040201); + const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1)); +}; + +struct DequantizerQ50 final : public BaseLegacyDequantizer { + + DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i, int8x16_t * q) const { + bits.prepare1(x[i].qs, q); + auto qh = x[i].qh; + q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0)))); + q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2)))); + } + inline void prepare1(int i) { + prepare1(i, bits.b); + } + + inline float16x4_t new_block(int i) { + ggml_half aux[4]; + for (int k = 0; k < 4; ++k) { + aux[k] = x[4*i+k].d; + prepare1(4*i+k, bits.b + 2*k); + } + return vld1_f16((const float16_t *)aux); + } + + HighBit5Legacy hbits; + + const uint8x16_t mh = vdupq_n_u8(0xf0); + +}; + +struct DequantizerQ80 final : public BaseLegacyDequantizer { + + DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i) { + bits.b[0] = vld1q_s8(x[i].qs); + bits.b[1] = vld1q_s8(x[i].qs+16); + } + + inline float16x4_t new_block(int i) { + ggml_half aux[4]; + for (int k = 0; k < 4; ++k) { + aux[k] = x[4*i+k].d; + bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs); + bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16); + } + return vld1_f16((const float16_t *)aux); + } + +}; + +struct DequantizerQ51 final : public BaseLegacyDequantizer { + + DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i, int8x16_t * q) const { + bits.prepare1(x[i].qs, q); + auto qh = x[i].qh; + q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0)))); + q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2)))); + 
} + inline void prepare1(int i) { + bits.prepare1(x[i].qs, bits.b); + } + + inline float16x8_t new_block(int i) { + uint32_t aux32[4]; + const uint32_t * s32 = (const uint32_t *)&x[4*i].d; + for (int k = 0; k < 4; ++k) { + aux32[k] = *s32; s32 += sizeof(block_q5_1)/4; + prepare1(4*i+k, bits.b + 2*k); + } + return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle))); + } + + HighBit5Legacy hbits; + + const uint8x16_t mh = vdupq_n_u8(0x10); + const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302}; + +}; + +template +inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i)); + auto scale = vcvt_f32_f16(sc16[iy]); + acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall)); + } +} + +template +inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) { + const int nb = n / QK4_1; + + float16x4_t sc16[Q8::nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + float32x4_t acc[Q8::nrc_y]; + for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + + for (int i = 0; i < nb/4; ++i) { + q8.process_scales(i, deq, sc16, acc); + sum_4(i, deq, q8, sc16, acc); + } + for (int i = 4*(nb/4); i < nb; ++i) { + q8.process_1_block(i, deq, acc); + } + + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} + +template +inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) { + const int nb = n / QK4_1; + + float16x4_t sc16[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq1.new_row(ix); + deq2.new_row(ix); + + float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) }; + + for (int i = 0; i < nb/8; ++i) { + q8.process_scales(2*i+0, deq1, sc16+0, acc+0); + q8.process_scales(2*i+1, deq2, sc16+1, acc+1); + sum_4(2*i+0, deq1, q8, sc16+0, acc+0); + sum_4(2*i+1, deq2, q8, sc16+1, acc+1); + } + for (int i = 2*(nb/8); i < nb/4; ++i) { + q8.process_scales(i, deq1, sc16, acc); + sum_4(i, deq1, q8, sc16, acc); + } + for (int i = 4*(nb/4); i < nb; ++i) { + q8.process_1_block(i, deq1, acc); + } + + info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1]))); + } +} + +template +static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Q81 q8(info); + if constexpr (nrc_y == 1) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); + } else { + Dequantizer deq(vx, bx); + mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x); + } +} + +template +static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Q80 q8(info); + if constexpr (nrc_y == 1) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); + } else { + Dequantizer deq(vx, bx); + mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x); + } +} + +template +static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + Q81<1> q8(info); + mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); +} + +template +static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + Q80<1> q8(info); + mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x); +} 
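// A minimal sketch of the interleaved right-hand-side layout the legacy-quant
// path above relies on (repack_q8_0_x4_sketch is a hypothetical helper shown
// for illustration only; it assumes the standard ggml block_q8_0 layout of one
// ggml_half scale followed by QK8_0 int8 quants). Four consecutive blocks are
// stored as four scales followed by four 32-byte quant runs, which is what
// Q80::load_scales() and Q80::quant_data() read.
static inline void repack_q8_0_x4_sketch(const block_q8_0 * in, block_q8_0_x4 * out) {
    for (int k = 0; k < 4; ++k) {
        out->d[k] = in[k].d;                               // the 4 scales come first
        std::memcpy(out->qs + k*QK8_0, in[k].qs, QK8_0);   // then the 4 quant runs
    }
}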
+ +template void MulMat::set_functions(MulMat& m) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qX_0_q8_0; + m.funcs[1] = mul_mat_qX_0_q8_0; + m.funcs[2] = mul_mat_qX_0_q8_0; + m.funcs[3] = mul_mat_qX_0_q8_0; + m.funcs[4] = mul_mat_qX_0_q8_0; + m.funcs[5] = mul_mat_qX_0_q8_0; + m.funcs[6] = mul_mat_qX_0_q8_0; + m.funcs[7] = mul_mat_qX_0_q8_0; + } + else if constexpr (std::is_same_v || std::is_same_v) { + m.funcs[0] = mul_mat_qX_1_q8_1; + m.funcs[1] = mul_mat_qX_1_q8_1; + m.funcs[2] = mul_mat_qX_1_q8_1; + m.funcs[3] = mul_mat_qX_1_q8_1; + m.funcs[4] = mul_mat_qX_1_q8_1; + m.funcs[5] = mul_mat_qX_1_q8_1; + m.funcs[6] = mul_mat_qX_1_q8_1; + m.funcs[7] = mul_mat_qX_1_q8_1; + } + else if constexpr (std::is_same_v || std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>; + m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>; + m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>; + m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>; + m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>; + m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>; + m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>; + m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>; + } + else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>; + m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>; + m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>; + m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>; + m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>; + m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>; + m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>; + m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>; + } + else { + m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>; + m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>; + m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>; + m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>; + m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>; + m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>; + m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>; + m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>; + m.funcs_v2 = mul_mat_qX_K_q8_K_T_v2; + } +} + +bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) { + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + + (void)Ny; + // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications. 
+ //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S || + // typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false; + + switch (typeA) { + case GGML_TYPE_Q2_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q3_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q4_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q5_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q6_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ4_XS: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ3_S: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ3_XXS: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ2_S: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ2_XS: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ2_XXS: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q4_0: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + break; + case GGML_TYPE_Q4_1: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); + break; + case GGML_TYPE_Q5_0: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + break; + case GGML_TYPE_Q5_1: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); + break; + case GGML_TYPE_Q8_0: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + break; + default: + return false; + } + return true; +} + +} + +#endif // __x86_64__ or __aarch64__ \ No newline at end of file diff --git a/third_party/llamafile/iqk_mul_mat_arm80.cpp b/third_party/llamafile/iqk_mul_mat_arm80.cpp new file mode 100644 index 0000000..16ef411 --- /dev/null +++ b/third_party/llamafile/iqk_mul_mat_arm80.cpp @@ -0,0 +1,10 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm80.cpp +// Copyright 2024 Iwan Kawrakow. +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + +#ifdef __aarch64__ +#define iqk_mul_mat iqk_mul_mat_arm80 +#define iqk_mul_mat_moe iqk_mul_mat_moe_arm80 +#include "iqk_mul_mat.inc" +#endif // __aarch64__ \ No newline at end of file diff --git a/third_party/llamafile/iqk_mul_mat_x86.inc b/third_party/llamafile/iqk_mul_mat_x86.inc new file mode 100644 index 0000000..a4e8c41 --- /dev/null +++ b/third_party/llamafile/iqk_mul_mat_x86.inc @@ -0,0 +1,4925 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc +// Copyrigth 2024 Iwan Kawrakow - Apache 2.0 Licens +// with additions from +// https://github.com/ikawrakow/ik_llama.cpp/blob/main/ggml/src/iqk/iqk_mul_mat.cpp +// Copyrigth 2024-2025 Iwan Kawrakow - MIT Licens +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp fenc=utf-8 :vi +// +// Copyright 2024 Iwan Kawrakow +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// +// Copyright (C) 2024-2025 Iwan Kawrakow +// MIT license +// SPDX-License-Identifier: MIT +// + +#include +#include +#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64) + +#include "llama.cpp/ggml-impl.h" +#include "llama.cpp/ggml-quants.h" +#include "sgemm.h" + +// For i-quants, I had to explicitely specify which +// functions to inline / not inline (at least for some +// of the functions), else performance would be significantly +// lower. This is worrysome as things can change with, +// e.g., a different compiler version or running on a different +// CPU. +#ifdef _MSC_VER +#define IQK_NOINLINE __declspec(noinline) +#define IQK_ALWAYS_INLINE inline +#else +#define IQK_NOINLINE __attribute__((__noinline__)) +#define IQK_ALWAYS_INLINE __attribute__((always_inline)) +#endif + +#define GGML_COMMON_IMPL_C +#include "llama.cpp/ggml-common.h" + +// clang-format off + +// This matrix - vector and matrix - matrix multiplication implementation +// for legacy quants, k-quants and i-quants makes prompt processing 150-200% +// (legacy and k-quants) or 250-400% (i-quants) faster. +// compared to mainline llama.cpp (and llamafile). +// It provides implementations for ARM_NEON (all quants) and AVX2 +// (all quants except sub-4 bit i-quants). +// +// Main idea is that unpacking the quants and the block scales to +// be ready for dot products with the corresponding Q8_Y quants +// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type). +// Hence, if we are performing a QX x Q8_Y matrix matrix +// multiplication (as needed for prompt processing), we can get +// a significant speedup by reusing the unpacked QX quants and scales +// for multiplication with several Q8_K columns. We also achieve fewer +// loads from memory, which is the main purpose of tiling in general +// purpose matrix multiplication packages. 
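// A minimal scalar sketch of the reuse described above (hypothetical names and
// plain int8/float data instead of real quantized blocks; not the actual
// kernel). Each block of the X row is unpacked once and then dotted against
// every Q8 column, so the unpacking cost is amortized over all nrc_y columns.
static inline void reuse_sketch(const signed char * x_packed, const signed char * const * y_cols,
                                float * acc, int n_blocks, int nrc_y) {
    constexpr int kBlock = 32;                        // block size, arbitrary for the sketch
    for (int ib = 0; ib < n_blocks; ++ib) {
        float unpacked[kBlock];                       // unpack this X block once ...
        for (int k = 0; k < kBlock; ++k) unpacked[k] = (float)x_packed[ib*kBlock + k];
        for (int iy = 0; iy < nrc_y; ++iy) {          // ... and reuse it for every column
            float sum = 0.f;
            for (int k = 0; k < kBlock; ++k) sum += unpacked[k] * (float)y_cols[iy][ib*kBlock + k];
            acc[iy] += sum;
        }
    }
}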
+
+#include
+#include
+
+#endif
+
+constexpr ggml_type GGML_TYPE_Q8_0_X4 = static_cast<ggml_type>(98);
+constexpr ggml_type GGML_TYPE_Q8_1_X4 = static_cast<ggml_type>(99);
+
+
+namespace {
+
+typedef struct {
+    int32_t i1;
+    int32_t i2;
+} mmid_row_mapping;
+
+struct DataInfo {
+    float * s;
+    const char * cy;
+    size_t bs;
+    size_t by;
+    int cur_y = 0;
+    int ne11;
+    const mmid_row_mapping * row_mapping = nullptr;
+    size_t bs2 = 0;
+
+    inline const char * src1_row(int iy) const {
+        if (!row_mapping) return cy + (cur_y + iy)*by;
+        int i11 = row_mapping[cur_y + iy].i1 % ne11;
+        int i12 = row_mapping[cur_y + iy].i2;
+        return cy + (i11 + i12*ne11)*by;
+    }
+
+    inline void store(int ix, int iy, float result) const {
+        *(dst_row(iy) + ix) = result;
+        //dst_row(iy)[ix] = result;
+    }
+    inline float * dst_row(int iy) const {
+        if (!row_mapping) return s + (cur_y + iy)*bs;
+        int i12 = row_mapping[cur_y + iy].i2;
+        int i1 = row_mapping[cur_y + iy].i1;
+        int i2 = i12;
+        return s + i1*bs + i2*bs2;
+    }
+};
+
+/*
+moonll
+change param for set_mul_mat
+add func16
+*/
+
+typedef void (*mul_mat_t)(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x);
+
+struct MulMat {
+    std::array funcs = {};
+    mul_mat_t func16 = nullptr;
+    //inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
+    IQK_NOINLINE void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
+        constexpr int k_x_step = 64; // This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile sizes are small)
+
+        // copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L162
+        // MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
+        if (func16 && nrc_y >= 16) {
+            int n_step = (nrc_y - info.cur_y)/16;
+            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+                auto this_info = info;
+                this_info.s += ix;
+                int this_nrc_x = ix + k_x_step <= nrc_x ? k_x_step : nrc_x - ix;
+                for (int iy = 0; iy < n_step; ++iy) {
+                    func16(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x);
+                    this_info.cur_y += 16;
+                }
+            }
+            info.cur_y += 16 * n_step;
+            if (info.cur_y == nrc_y) return;
+        }
+        // end copy
+
+        int n_step = (nrc_y - info.cur_y)/funcs.size();
+        if (n_step > 0) {
+            for (int ix = 0; ix < nrc_x; ix += k_x_step) {
+                auto this_info = info;
+                this_info.s += ix;
+                int this_nrc_x = ix + k_x_step <= nrc_x ?
k_x_step : nrc_x - ix; + for (int iy = 0; iy < n_step; ++iy) { + funcs.back()(n, (const void *)((const char *)vx + ix*bx), bx, this_info, this_nrc_x); + this_info.cur_y += funcs.size(); + } + } + info.cur_y += funcs.size() * n_step; + } + int n_left = nrc_y - info.cur_y; + if (n_left > 0) { + funcs[n_left-1](n, vx, bx, info, nrc_x); + } + } + static IQK_NOINLINE bool set_mul_mat(int typeA, int typeB,int ne00, MulMat& mm, int Ny); +private: + template static IQK_NOINLINE void set_functions(MulMat& m); +}; + +inline void make_q4_scales(const uint8_t * scales8, uint32_t * aux32) { + const uint16_t * scales = (const uint16_t *)scales8; + const uint32_t a0 = scales[0] | (scales[1] << 16); + const uint32_t a1 = scales[2] | (scales[3] << 16); + const uint32_t a2 = scales[4] | (scales[5] << 16); + aux32[3] = ((a2 >> 4) & 0x0f0f0f0f) | ((a1 >> 2) & 0x30303030); + aux32[1] = ((a2 >> 0) & 0x0f0f0f0f) | ((a0 >> 2) & 0x30303030); + aux32[2] = a1 & 0x3f3f3f3f; + aux32[0] = a0 & 0x3f3f3f3f; +} + +/* +moonll +decoding tables +*/ +// copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L570 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +#ifdef __AVX2__ +static const uint64_t iq1s_grid_us[2048] = { + 0x0000000000000000, 0x0000000000000002, 0x0000000000000101, 0x0000000000000200, + 0x0000000000000202, 0x0000000000010001, 0x0000000000010101, 0x0000000000020000, + 0x0000000000020002, 0x0000000000020200, 0x0000000000020202, 0x0000000001000101, + 0x0000000001010001, 0x0000000001010100, 0x0000000001010102, 0x0000000001020101, + 0x0000000002000000, 0x0000000002000002, 0x0000000002000200, 0x0000000002000202, + 0x0000000002010101, 0x0000000002020000, 0x0000000002020002, 0x0000000002020200, + 0x0000000002020202, 0x0000000100000100, 0x0000000100000101, 0x0000000100010001, + 0x0000000100010100, 0x0000000100010102, 0x0000000100010201, 0x0000000100010202, + 0x0000000100020101, 0x0000000101000001, 0x0000000101000102, 0x0000000101000201, + 0x0000000101010002, 0x0000000101010101, 0x0000000101010202, 0x0000000101020001, + 0x0000000101020100, 0x0000000101020102, 0x0000000101020200, 0x0000000102000101, + 0x0000000102010001, 0x0000000102010100, 0x0000000102010102, 0x0000000102020101, + 0x0000000200000000, 0x0000000200000002, 0x0000000200000200, 0x0000000200000202, + 0x0000000200010101, 0x0000000200020000, 0x0000000200020002, 0x0000000200020200, + 0x0000000200020202, 0x0000000201000101, 0x0000000201010001, 0x0000000201010201, + 0x0000000201020100, 0x0000000201020201, 0x0000000202000000, 0x0000000202000002, + 0x0000000202000200, 0x0000000202000202, 0x0000000202010001, 0x0000000202010101, + 0x0000000202010201, 0x0000000202020000, 0x0000000202020002, 0x0000000202020200, + 0x0000000202020202, 0x0000010000010001, 0x0000010000010100, 0x0000010000010102, + 0x0000010000020101, 0x0000010001000001, 0x0000010001000201, 0x0000010001010101, + 0x0000010001010202, 0x0000010001020100, 0x0000010001020101, 0x0000010002010001, + 0x0000010002010201, 0x0000010002020101, 0x0000010100000001, 0x0000010100000100, + 0x0000010100000101, 0x0000010100000102, 0x0000010100010101, 0x0000010100010200, + 0x0000010100010202, 0x0000010100020201, 0x0000010101000000, 0x0000010101000101, + 0x0000010101000202, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100, + 0x0000010101010101, 0x0000010101010102, 0x0000010101010201, 0x0000010101020000, + 0x0000010101020002, 0x0000010101020101, 0x0000010101020200, 0x0000010101020202, + 0x0000010102000001, 0x0000010102010001, 
0x0000010102010101, 0x0000010102010200, + 0x0000010102010202, 0x0000010102020001, 0x0000010102020100, 0x0000010102020101, + 0x0000010102020102, 0x0000010102020201, 0x0000010200010100, 0x0000010200010201, + 0x0000010201000001, 0x0000010201000100, 0x0000010201010000, 0x0000010201010002, + 0x0000010201010101, 0x0000010201010200, 0x0000010201020000, 0x0000010201020001, + 0x0000010201020102, 0x0000010201020201, 0x0000010202000101, 0x0000010202010001, + 0x0000010202010100, 0x0000010202010201, 0x0000020000000000, 0x0000020000000002, + 0x0000020000000200, 0x0000020000000202, 0x0000020000010101, 0x0000020000020000, + 0x0000020000020002, 0x0000020000020200, 0x0000020000020202, 0x0000020001000101, + 0x0000020001010001, 0x0000020001010102, 0x0000020001020101, 0x0000020002000000, + 0x0000020002000002, 0x0000020002000200, 0x0000020002000202, 0x0000020002010101, + 0x0000020002020000, 0x0000020002020002, 0x0000020002020200, 0x0000020002020202, + 0x0000020100000101, 0x0000020100010001, 0x0000020100010100, 0x0000020100010201, + 0x0000020100020100, 0x0000020100020101, 0x0000020101000001, 0x0000020101010000, + 0x0000020101010001, 0x0000020101010101, 0x0000020101020001, 0x0000020101020100, + 0x0000020101020201, 0x0000020102010001, 0x0000020102010100, 0x0000020102010102, + 0x0000020102010201, 0x0000020102020101, 0x0000020200000000, 0x0000020200000002, + 0x0000020200000200, 0x0000020200000202, 0x0000020200010101, 0x0000020200020000, + 0x0000020200020002, 0x0000020200020200, 0x0000020200020202, 0x0000020201000101, + 0x0000020201010001, 0x0000020201010201, 0x0000020201020001, 0x0000020201020101, + 0x0000020202000000, 0x0000020202000002, 0x0000020202000101, 0x0000020202000200, + 0x0000020202000202, 0x0000020202010101, 0x0000020202020000, 0x0000020202020002, + 0x0000020202020200, 0x0000020202020202, 0x0001000000010000, 0x0001000000010001, + 0x0001000000010100, 0x0001000000010201, 0x0001000000020100, 0x0001000000020101, + 0x0001000001000001, 0x0001000001000100, 0x0001000001010000, 0x0001000001010101, + 0x0001000001010200, 0x0001000001020001, 0x0001000001020100, 0x0001000001020101, + 0x0001000001020201, 0x0001000002010001, 0x0001000002010100, 0x0001000002010102, + 0x0001000002020001, 0x0001000002020101, 0x0001000100000001, 0x0001000100000100, + 0x0001000100000102, 0x0001000100000201, 0x0001000100010000, 0x0001000100010002, + 0x0001000100010101, 0x0001000100010200, 0x0001000100020001, 0x0001000100020100, + 0x0001000100020201, 0x0001000101000101, 0x0001000101000202, 0x0001000101010000, + 0x0001000101010001, 0x0001000101010002, 0x0001000101010100, 0x0001000101010101, + 0x0001000101010102, 0x0001000101010201, 0x0001000101020000, 0x0001000101020101, + 0x0001000102000100, 0x0001000102010002, 0x0001000102010101, 0x0001000102020001, + 0x0001000102020100, 0x0001000200010001, 0x0001000200010100, 0x0001000200010102, + 0x0001000200020101, 0x0001000201000000, 0x0001000201000102, 0x0001000201000201, + 0x0001000201010002, 0x0001000201010101, 0x0001000201010200, 0x0001000201010202, + 0x0001000201020100, 0x0001000201020102, 0x0001000202000101, 0x0001000202010001, + 0x0001000202010100, 0x0001000202010102, 0x0001000202020101, 0x0001010000000001, + 0x0001010000000102, 0x0001010000000201, 0x0001010000010100, 0x0001010000010101, + 0x0001010000010200, 0x0001010000010201, 0x0001010000020001, 0x0001010000020102, + 0x0001010001000001, 0x0001010001000101, 0x0001010001000102, 0x0001010001000200, + 0x0001010001000202, 0x0001010001010001, 0x0001010001010100, 0x0001010001010101, + 0x0001010001010102, 0x0001010001010201, 0x0001010001020002, 
0x0001010001020101, + 0x0001010001020200, 0x0001010002000100, 0x0001010002000201, 0x0001010002010000, + 0x0001010002010100, 0x0001010002010101, 0x0001010002010200, 0x0001010002010201, + 0x0001010002010202, 0x0001010002020001, 0x0001010002020100, 0x0001010002020101, + 0x0001010002020201, 0x0001010100000002, 0x0001010100000101, 0x0001010100000202, + 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, 0x0001010100010102, + 0x0001010100010201, 0x0001010100020000, 0x0001010100020002, 0x0001010100020101, + 0x0001010100020200, 0x0001010100020202, 0x0001010101000001, 0x0001010101000100, + 0x0001010101000101, 0x0001010101000102, 0x0001010101010001, 0x0001010101010002, + 0x0001010101010100, 0x0001010101010101, 0x0001010101010102, 0x0001010101010201, + 0x0001010101010202, 0x0001010101020001, 0x0001010101020100, 0x0001010101020101, + 0x0001010101020102, 0x0001010101020201, 0x0001010102000000, 0x0001010102000002, + 0x0001010102000100, 0x0001010102000101, 0x0001010102000200, 0x0001010102000202, + 0x0001010102010000, 0x0001010102010001, 0x0001010102010100, 0x0001010102010101, + 0x0001010102010102, 0x0001010102010201, 0x0001010102010202, 0x0001010102020000, + 0x0001010102020002, 0x0001010102020101, 0x0001010200000001, 0x0001010200000100, + 0x0001010200000101, 0x0001010200000102, 0x0001010200010101, 0x0001010200010102, + 0x0001010200010200, 0x0001010200010202, 0x0001010200020001, 0x0001010200020102, + 0x0001010201000000, 0x0001010201000002, 0x0001010201000100, 0x0001010201000101, + 0x0001010201000200, 0x0001010201000202, 0x0001010201010001, 0x0001010201010101, + 0x0001010201010102, 0x0001010201010200, 0x0001010201010201, 0x0001010201020001, + 0x0001010201020100, 0x0001010201020101, 0x0001010201020200, 0x0001010201020201, + 0x0001010201020202, 0x0001010202000102, 0x0001010202000202, 0x0001010202010002, + 0x0001010202010101, 0x0001010202020100, 0x0001010202020201, 0x0001020000010001, + 0x0001020000010102, 0x0001020000020101, 0x0001020001000001, 0x0001020001000100, + 0x0001020001000102, 0x0001020001000201, 0x0001020001010000, 0x0001020001010101, + 0x0001020001010200, 0x0001020001010202, 0x0001020001020000, 0x0001020001020001, + 0x0001020001020100, 0x0001020001020102, 0x0001020001020201, 0x0001020002000101, + 0x0001020002010001, 0x0001020002010100, 0x0001020002020101, 0x0001020100010000, + 0x0001020100010002, 0x0001020100010101, 0x0001020100010202, 0x0001020100020001, + 0x0001020100020101, 0x0001020101000002, 0x0001020101000100, 0x0001020101000101, + 0x0001020101000200, 0x0001020101010001, 0x0001020101010100, 0x0001020101010101, + 0x0001020101010102, 0x0001020101010201, 0x0001020101010202, 0x0001020101020000, + 0x0001020101020101, 0x0001020101020202, 0x0001020102000201, 0x0001020102010001, + 0x0001020102010002, 0x0001020102010101, 0x0001020102010200, 0x0001020102020001, + 0x0001020102020102, 0x0001020102020201, 0x0001020200000201, 0x0001020200010102, + 0x0001020200020100, 0x0001020200020102, 0x0001020201000100, 0x0001020201000102, + 0x0001020201000201, 0x0001020201010000, 0x0001020201010002, 0x0001020201010101, + 0x0001020201010200, 0x0001020201020001, 0x0001020201020102, 0x0001020201020201, + 0x0001020202000101, 0x0001020202010001, 0x0001020202010102, 0x0001020202010202, + 0x0002000000000000, 0x0002000000000002, 0x0002000000000200, 0x0002000000000202, + 0x0002000000010101, 0x0002000000020000, 0x0002000000020002, 0x0002000000020101, + 0x0002000000020200, 0x0002000000020202, 0x0002000001000101, 0x0002000001010001, + 0x0002000001010201, 0x0002000001020001, 0x0002000001020101, 0x0002000002000000, + 
0x0002000002000002, 0x0002000002000200, 0x0002000002000202, 0x0002000002010101, + 0x0002000002020000, 0x0002000002020002, 0x0002000002020101, 0x0002000002020200, + 0x0002000002020202, 0x0002000100000101, 0x0002000100010001, 0x0002000100010100, + 0x0002000100010201, 0x0002000100020101, 0x0002000101000002, 0x0002000101000100, + 0x0002000101000201, 0x0002000101010101, 0x0002000101010200, 0x0002000101010202, + 0x0002000101020001, 0x0002000101020100, 0x0002000101020101, 0x0002000101020102, + 0x0002000102000101, 0x0002000102010000, 0x0002000102010102, 0x0002000102010201, + 0x0002000102020101, 0x0002000200000001, 0x0002000200000200, 0x0002000200000202, + 0x0002000200010001, 0x0002000200010101, 0x0002000200020000, 0x0002000200020002, + 0x0002000200020200, 0x0002000200020202, 0x0002000201000101, 0x0002000201010001, + 0x0002000201010102, 0x0002000201010201, 0x0002000201020101, 0x0002000202000001, + 0x0002000202000200, 0x0002000202000202, 0x0002000202010001, 0x0002000202010101, + 0x0002000202020000, 0x0002000202020002, 0x0002000202020200, 0x0002000202020202, + 0x0002010000000101, 0x0002010000010100, 0x0002010000010102, 0x0002010000010201, + 0x0002010000020101, 0x0002010001000100, 0x0002010001000101, 0x0002010001000102, + 0x0002010001000201, 0x0002010001010002, 0x0002010001010101, 0x0002010001010200, + 0x0002010001010202, 0x0002010001020102, 0x0002010002000101, 0x0002010002010001, + 0x0002010002010100, 0x0002010002010201, 0x0002010002020001, 0x0002010002020101, + 0x0002010100000201, 0x0002010100010101, 0x0002010100020001, 0x0002010100020201, + 0x0002010101000000, 0x0002010101000101, 0x0002010101000200, 0x0002010101010001, + 0x0002010101010100, 0x0002010101010101, 0x0002010101010201, 0x0002010101020002, + 0x0002010101020101, 0x0002010101020200, 0x0002010102000201, 0x0002010102010000, + 0x0002010102010100, 0x0002010102010101, 0x0002010102010200, 0x0002010102010202, + 0x0002010102020001, 0x0002010102020100, 0x0002010102020102, 0x0002010102020201, + 0x0002010200000101, 0x0002010200010000, 0x0002010200010002, 0x0002010200010201, + 0x0002010200020101, 0x0002010201000001, 0x0002010201000201, 0x0002010201010101, + 0x0002010201020000, 0x0002010201020001, 0x0002010201020201, 0x0002010202000100, + 0x0002010202000102, 0x0002010202010000, 0x0002010202010202, 0x0002020000000000, + 0x0002020000000002, 0x0002020000000200, 0x0002020000000202, 0x0002020000010101, + 0x0002020000020000, 0x0002020000020002, 0x0002020000020200, 0x0002020000020202, + 0x0002020001000101, 0x0002020001010001, 0x0002020001010100, 0x0002020001020101, + 0x0002020002000000, 0x0002020002000002, 0x0002020002000200, 0x0002020002000202, + 0x0002020002020000, 0x0002020002020002, 0x0002020002020200, 0x0002020002020202, + 0x0002020100000201, 0x0002020100010001, 0x0002020100010100, 0x0002020100010201, + 0x0002020100020101, 0x0002020101000102, 0x0002020101000201, 0x0002020101010002, + 0x0002020101010101, 0x0002020101020001, 0x0002020101020100, 0x0002020101020102, + 0x0002020101020201, 0x0002020102000101, 0x0002020102010000, 0x0002020102010102, + 0x0002020102010201, 0x0002020102020100, 0x0002020102020101, 0x0002020200000000, + 0x0002020200000002, 0x0002020200000200, 0x0002020200000202, 0x0002020200020000, + 0x0002020200020002, 0x0002020200020200, 0x0002020200020202, 0x0002020201000101, + 0x0002020201010001, 0x0002020201010102, 0x0002020201010201, 0x0002020201020101, + 0x0002020202000000, 0x0002020202000002, 0x0002020202000200, 0x0002020202000202, + 0x0002020202010101, 0x0002020202020000, 0x0002020202020002, 0x0002020202020200, + 0x0002020202020202, 
0x0100000000000101, 0x0100000000010001, 0x0100000000010102, + 0x0100000000020101, 0x0100000001000201, 0x0100000001010002, 0x0100000001010101, + 0x0100000001010200, 0x0100000001010202, 0x0100000001020001, 0x0100000001020100, + 0x0100000001020102, 0x0100000002010100, 0x0100000002010201, 0x0100000002020001, + 0x0100000002020102, 0x0100000100000000, 0x0100000100000001, 0x0100000100000100, + 0x0100000100000102, 0x0100000100000201, 0x0100000100010002, 0x0100000100010101, + 0x0100000100010102, 0x0100000100010200, 0x0100000100010202, 0x0100000100020001, + 0x0100000100020102, 0x0100000100020201, 0x0100000101000101, 0x0100000101000200, + 0x0100000101000202, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, + 0x0100000101010102, 0x0100000101010201, 0x0100000101010202, 0x0100000101020101, + 0x0100000101020200, 0x0100000101020202, 0x0100000102000001, 0x0100000102000100, + 0x0100000102000102, 0x0100000102010000, 0x0100000102010002, 0x0100000102010101, + 0x0100000102020000, 0x0100000102020001, 0x0100000102020002, 0x0100000200000101, + 0x0100000200010001, 0x0100000200010100, 0x0100000200010102, 0x0100000200020101, + 0x0100000201000001, 0x0100000201010002, 0x0100000201010101, 0x0100000201010202, + 0x0100000201020100, 0x0100000201020201, 0x0100000202000201, 0x0100000202010100, + 0x0100000202020101, 0x0100010000000001, 0x0100010000010101, 0x0100010000010201, + 0x0100010000020201, 0x0100010001000101, 0x0100010001000200, 0x0100010001000202, + 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, 0x0100010001010102, + 0x0100010001020001, 0x0100010001020002, 0x0100010001020101, 0x0100010001020200, + 0x0100010001020202, 0x0100010002000001, 0x0100010002000102, 0x0100010002000201, + 0x0100010002010000, 0x0100010002010002, 0x0100010002010101, 0x0100010002020000, + 0x0100010002020001, 0x0100010002020201, 0x0100010100000001, 0x0100010100000002, + 0x0100010100000101, 0x0100010100000202, 0x0100010100010001, 0x0100010100010100, + 0x0100010100010101, 0x0100010100010102, 0x0100010100010201, 0x0100010100020000, + 0x0100010100020101, 0x0100010100020202, 0x0100010101000001, 0x0100010101000100, + 0x0100010101000101, 0x0100010101000102, 0x0100010101000201, 0x0100010101010000, + 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, 0x0100010101010102, + 0x0100010101010200, 0x0100010101010201, 0x0100010101020001, 0x0100010101020100, + 0x0100010101020101, 0x0100010101020102, 0x0100010101020201, 0x0100010102000002, + 0x0100010102000100, 0x0100010102000101, 0x0100010102000200, 0x0100010102010001, + 0x0100010102010100, 0x0100010102010101, 0x0100010102010102, 0x0100010102010201, + 0x0100010102010202, 0x0100010102020101, 0x0100010102020200, 0x0100010102020202, + 0x0100010200000001, 0x0100010200000101, 0x0100010200000201, 0x0100010200010100, + 0x0100010200010101, 0x0100010200010200, 0x0100010200010202, 0x0100010200020001, + 0x0100010200020100, 0x0100010200020201, 0x0100010201000000, 0x0100010201000002, + 0x0100010201000101, 0x0100010201000200, 0x0100010201010000, 0x0100010201010001, + 0x0100010201010002, 0x0100010201010101, 0x0100010201010102, 0x0100010201010201, + 0x0100010201020002, 0x0100010201020101, 0x0100010201020200, 0x0100010202000001, + 0x0100010202000101, 0x0100010202000202, 0x0100010202010100, 0x0100010202010101, + 0x0100010202020001, 0x0100010202020100, 0x0100010202020102, 0x0100020000000101, + 0x0100020000010001, 0x0100020000010101, 0x0100020000010202, 0x0100020000020101, + 0x0100020001000002, 0x0100020001000201, 0x0100020001010000, 0x0100020001010101, + 0x0100020001010200, 0x0100020001020001, 
0x0100020001020100, 0x0100020001020102, + 0x0100020001020201, 0x0100020002000101, 0x0100020002010001, 0x0100020002010100, + 0x0100020002010102, 0x0100020002010201, 0x0100020002020101, 0x0100020100000001, + 0x0100020100000101, 0x0100020100000102, 0x0100020100000202, 0x0100020100010000, + 0x0100020100010100, 0x0100020100010101, 0x0100020100010200, 0x0100020100020001, + 0x0100020100020100, 0x0100020100020102, 0x0100020101000000, 0x0100020101000101, + 0x0100020101000202, 0x0100020101010001, 0x0100020101010002, 0x0100020101010100, + 0x0100020101010101, 0x0100020101010102, 0x0100020101010201, 0x0100020101020000, + 0x0100020101020002, 0x0100020101020101, 0x0100020101020102, 0x0100020101020202, + 0x0100020102000102, 0x0100020102000201, 0x0100020102010002, 0x0100020102010101, + 0x0100020102010102, 0x0100020102010200, 0x0100020102020001, 0x0100020102020100, + 0x0100020102020102, 0x0100020102020201, 0x0100020200010102, 0x0100020201000100, + 0x0100020201000102, 0x0100020201000201, 0x0100020201010101, 0x0100020201010200, + 0x0100020201010202, 0x0100020201020100, 0x0100020201020201, 0x0100020202010100, + 0x0100020202020101, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, + 0x0101000000000102, 0x0101000000000201, 0x0101000000010002, 0x0101000000010101, + 0x0101000000010202, 0x0101000000020001, 0x0101000000020100, 0x0101000000020201, + 0x0101000001000000, 0x0101000001000101, 0x0101000001000200, 0x0101000001010001, + 0x0101000001010100, 0x0101000001010101, 0x0101000001010102, 0x0101000001010201, + 0x0101000001020101, 0x0101000001020200, 0x0101000002000102, 0x0101000002000201, + 0x0101000002010101, 0x0101000002010200, 0x0101000002020000, 0x0101000002020001, + 0x0101000002020102, 0x0101000002020201, 0x0101000100000101, 0x0101000100000200, + 0x0101000100000201, 0x0101000100000202, 0x0101000100010001, 0x0101000100010100, + 0x0101000100010101, 0x0101000100010102, 0x0101000100010200, 0x0101000100010201, + 0x0101000100020000, 0x0101000100020101, 0x0101000100020102, 0x0101000100020200, + 0x0101000100020202, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, + 0x0101000101000102, 0x0101000101000201, 0x0101000101010000, 0x0101000101010001, + 0x0101000101010002, 0x0101000101010100, 0x0101000101010101, 0x0101000101010102, + 0x0101000101010200, 0x0101000101010201, 0x0101000101010202, 0x0101000101020001, + 0x0101000101020100, 0x0101000101020101, 0x0101000101020102, 0x0101000101020201, + 0x0101000102000002, 0x0101000102000101, 0x0101000102010001, 0x0101000102010100, + 0x0101000102010101, 0x0101000102010102, 0x0101000102010201, 0x0101000102020000, + 0x0101000102020101, 0x0101000102020202, 0x0101000200000001, 0x0101000200000102, + 0x0101000200010002, 0x0101000200010101, 0x0101000200010202, 0x0101000200020001, + 0x0101000200020100, 0x0101000201000002, 0x0101000201000101, 0x0101000201000202, + 0x0101000201010001, 0x0101000201010100, 0x0101000201010101, 0x0101000201010102, + 0x0101000201010201, 0x0101000201020002, 0x0101000201020101, 0x0101000202000101, + 0x0101000202010000, 0x0101000202010002, 0x0101000202010101, 0x0101000202010201, + 0x0101000202010202, 0x0101000202020100, 0x0101010000000100, 0x0101010000000101, + 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, 0x0101010000010102, + 0x0101010000010200, 0x0101010000010201, 0x0101010000020001, 0x0101010000020101, + 0x0101010000020200, 0x0101010000020202, 0x0101010001000001, 0x0101010001000100, + 0x0101010001000101, 0x0101010001000102, 0x0101010001000201, 0x0101010001000202, + 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 
0x0101010001010101, + 0x0101010001010102, 0x0101010001010200, 0x0101010001010201, 0x0101010001010202, + 0x0101010001020001, 0x0101010001020002, 0x0101010001020100, 0x0101010001020101, + 0x0101010001020102, 0x0101010001020201, 0x0101010002000000, 0x0101010002000200, + 0x0101010002000202, 0x0101010002010001, 0x0101010002010100, 0x0101010002010101, + 0x0101010002010102, 0x0101010002010201, 0x0101010002020001, 0x0101010002020100, + 0x0101010002020101, 0x0101010002020202, 0x0101010100000001, 0x0101010100000002, + 0x0101010100000100, 0x0101010100000101, 0x0101010100000102, 0x0101010100000201, + 0x0101010100010000, 0x0101010100010001, 0x0101010100010002, 0x0101010100010100, + 0x0101010100010101, 0x0101010100010102, 0x0101010100010201, 0x0101010100010202, + 0x0101010100020001, 0x0101010100020100, 0x0101010100020101, 0x0101010100020102, + 0x0101010100020201, 0x0101010101000000, 0x0101010101000001, 0x0101010101000002, + 0x0101010101000100, 0x0101010101000101, 0x0101010101000102, 0x0101010101000200, + 0x0101010101000201, 0x0101010101010000, 0x0101010101010001, 0x0101010101010002, + 0x0101010101010100, 0x0101010101010101, 0x0101010101010102, 0x0101010101010200, + 0x0101010101010201, 0x0101010101010202, 0x0101010101020000, 0x0101010101020001, + 0x0101010101020100, 0x0101010101020101, 0x0101010101020102, 0x0101010101020200, + 0x0101010101020201, 0x0101010101020202, 0x0101010102000001, 0x0101010102000100, + 0x0101010102000101, 0x0101010102000201, 0x0101010102000202, 0x0101010102010000, + 0x0101010102010001, 0x0101010102010100, 0x0101010102010101, 0x0101010102010102, + 0x0101010102010200, 0x0101010102010201, 0x0101010102020001, 0x0101010102020100, + 0x0101010102020101, 0x0101010102020102, 0x0101010102020201, 0x0101010200000000, + 0x0101010200000001, 0x0101010200000002, 0x0101010200000100, 0x0101010200000102, + 0x0101010200000200, 0x0101010200000201, 0x0101010200010001, 0x0101010200010100, + 0x0101010200010101, 0x0101010200010200, 0x0101010200010201, 0x0101010200020000, + 0x0101010200020001, 0x0101010200020002, 0x0101010200020100, 0x0101010200020101, + 0x0101010200020102, 0x0101010200020200, 0x0101010200020201, 0x0101010201000001, + 0x0101010201000101, 0x0101010201000102, 0x0101010201000200, 0x0101010201000201, + 0x0101010201000202, 0x0101010201010000, 0x0101010201010001, 0x0101010201010002, + 0x0101010201010100, 0x0101010201010101, 0x0101010201010102, 0x0101010201010200, + 0x0101010201010201, 0x0101010201010202, 0x0101010201020001, 0x0101010201020100, + 0x0101010201020101, 0x0101010201020201, 0x0101010202000002, 0x0101010202000101, + 0x0101010202000102, 0x0101010202000200, 0x0101010202000201, 0x0101010202000202, + 0x0101010202010001, 0x0101010202010101, 0x0101010202010202, 0x0101010202020002, + 0x0101010202020101, 0x0101010202020102, 0x0101010202020200, 0x0101010202020201, + 0x0101020000000100, 0x0101020000000101, 0x0101020000000102, 0x0101020000000201, + 0x0101020000010000, 0x0101020000010101, 0x0101020000010200, 0x0101020000020001, + 0x0101020000020202, 0x0101020001000101, 0x0101020001000200, 0x0101020001000202, + 0x0101020001010001, 0x0101020001010100, 0x0101020001010101, 0x0101020001010102, + 0x0101020001010200, 0x0101020001010201, 0x0101020001020000, 0x0101020001020002, + 0x0101020001020100, 0x0101020001020101, 0x0101020002000002, 0x0101020002000201, + 0x0101020002010000, 0x0101020002010002, 0x0101020002010101, 0x0101020002010200, + 0x0101020002020001, 0x0101020002020201, 0x0101020100000001, 0x0101020100000002, + 0x0101020100000101, 0x0101020100000202, 0x0101020100010001, 0x0101020100010100, + 
0x0101020100010101, 0x0101020100010102, 0x0101020100010201, 0x0101020100020101, + 0x0101020101000001, 0x0101020101000100, 0x0101020101000101, 0x0101020101000102, + 0x0101020101000201, 0x0101020101010000, 0x0101020101010001, 0x0101020101010002, + 0x0101020101010100, 0x0101020101010101, 0x0101020101010102, 0x0101020101010200, + 0x0101020101010201, 0x0101020101010202, 0x0101020101020001, 0x0101020101020100, + 0x0101020101020101, 0x0101020101020102, 0x0101020101020201, 0x0101020102000001, + 0x0101020102000101, 0x0101020102000201, 0x0101020102010001, 0x0101020102010100, + 0x0101020102010101, 0x0101020102010102, 0x0101020102010200, 0x0101020102010201, + 0x0101020102020101, 0x0101020200000100, 0x0101020200000200, 0x0101020200010101, + 0x0101020200010202, 0x0101020200020000, 0x0101020200020101, 0x0101020200020102, + 0x0101020200020201, 0x0101020201000101, 0x0101020201000200, 0x0101020201000201, + 0x0101020201010001, 0x0101020201010101, 0x0101020201010102, 0x0101020201010200, + 0x0101020201010201, 0x0101020201020002, 0x0101020201020101, 0x0101020201020200, + 0x0101020201020202, 0x0101020202000001, 0x0101020202000202, 0x0101020202010002, + 0x0101020202010101, 0x0101020202010102, 0x0101020202010200, 0x0101020202010202, + 0x0101020202020001, 0x0102000000000101, 0x0102000000010100, 0x0102000000010102, + 0x0102000000010201, 0x0102000000020101, 0x0102000001000100, 0x0102000001010000, + 0x0102000001010101, 0x0102000001010102, 0x0102000001010200, 0x0102000001010202, + 0x0102000001020001, 0x0102000001020100, 0x0102000001020102, 0x0102000001020201, + 0x0102000002000001, 0x0102000002010102, 0x0102000002020101, 0x0102000100000001, + 0x0102000100000100, 0x0102000100000102, 0x0102000100000201, 0x0102000100010002, + 0x0102000100010101, 0x0102000100020001, 0x0102000100020002, 0x0102000100020102, + 0x0102000100020201, 0x0102000101000101, 0x0102000101000201, 0x0102000101010001, + 0x0102000101010101, 0x0102000101010102, 0x0102000101010201, 0x0102000101020101, + 0x0102000101020102, 0x0102000101020202, 0x0102000102000100, 0x0102000102000202, + 0x0102000102010002, 0x0102000102010101, 0x0102000102020001, 0x0102000102020102, + 0x0102000102020201, 0x0102000200010001, 0x0102000200010102, 0x0102000200010201, + 0x0102000201000000, 0x0102000201000001, 0x0102000201000102, 0x0102000201010101, + 0x0102000201010102, 0x0102000201010200, 0x0102000201020000, 0x0102000202000101, + 0x0102000202010001, 0x0102000202010102, 0x0102000202020101, 0x0102010000010001, + 0x0102010000010002, 0x0102010000010101, 0x0102010000010102, 0x0102010000010202, + 0x0102010000020001, 0x0102010000020102, 0x0102010000020201, 0x0102010001000000, + 0x0102010001000002, 0x0102010001000101, 0x0102010001000200, 0x0102010001000202, + 0x0102010001010001, 0x0102010001010100, 0x0102010001010101, 0x0102010001010102, + 0x0102010001010201, 0x0102010001010202, 0x0102010001020000, 0x0102010001020002, + 0x0102010001020101, 0x0102010002000100, 0x0102010002000101, 0x0102010002000201, + 0x0102010002010000, 0x0102010002010002, 0x0102010002010100, 0x0102010002010101, + 0x0102010002010102, 0x0102010002010200, 0x0102010002010202, 0x0102010002020001, + 0x0102010002020100, 0x0102010002020201, 0x0102010100000101, 0x0102010100000200, + 0x0102010100000202, 0x0102010100010001, 0x0102010100010101, 0x0102010100010102, + 0x0102010100010201, 0x0102010101000100, 0x0102010101000101, 0x0102010101000102, + 0x0102010101000201, 0x0102010101010000, 0x0102010101010001, 0x0102010101010100, + 0x0102010101010101, 0x0102010101010102, 0x0102010101010201, 0x0102010101020001, + 0x0102010101020100, 
0x0102010101020101, 0x0102010101020102, 0x0102010101020201, + 0x0102010102000102, 0x0102010102000201, 0x0102010102000202, 0x0102010102010001, + 0x0102010102010101, 0x0102010102010102, 0x0102010102010201, 0x0102010102010202, + 0x0102010102020002, 0x0102010102020101, 0x0102010102020102, 0x0102010102020200, + 0x0102010200000002, 0x0102010200000201, 0x0102010200010101, 0x0102010200020000, + 0x0102010200020102, 0x0102010200020200, 0x0102010200020201, 0x0102010201000000, + 0x0102010201000101, 0x0102010201000200, 0x0102010201000202, 0x0102010201010001, + 0x0102010201010100, 0x0102010201010101, 0x0102010201010102, 0x0102010201010200, + 0x0102010201010202, 0x0102010201020000, 0x0102010201020101, 0x0102010201020200, + 0x0102010202000000, 0x0102010202000002, 0x0102010202000101, 0x0102010202000202, + 0x0102010202010100, 0x0102010202010102, 0x0102010202010200, 0x0102010202010201, + 0x0102010202020000, 0x0102010202020100, 0x0102010202020102, 0x0102010202020202, + 0x0102020000010102, 0x0102020000010201, 0x0102020000020101, 0x0102020001000001, + 0x0102020001010002, 0x0102020001010101, 0x0102020001010202, 0x0102020001020001, + 0x0102020001020201, 0x0102020002000101, 0x0102020002010001, 0x0102020002010200, + 0x0102020002020102, 0x0102020100000001, 0x0102020100000100, 0x0102020100010000, + 0x0102020100010101, 0x0102020100020001, 0x0102020100020100, 0x0102020100020102, + 0x0102020100020201, 0x0102020101000000, 0x0102020101000001, 0x0102020101000101, + 0x0102020101000102, 0x0102020101000200, 0x0102020101010001, 0x0102020101010100, + 0x0102020101010101, 0x0102020101010102, 0x0102020101010201, 0x0102020101020000, + 0x0102020101020101, 0x0102020101020202, 0x0102020102000002, 0x0102020102000100, + 0x0102020102000202, 0x0102020102010101, 0x0102020102020001, 0x0102020102020100, + 0x0102020102020101, 0x0102020102020201, 0x0102020200010001, 0x0102020200010102, + 0x0102020200010200, 0x0102020201000001, 0x0102020201000100, 0x0102020201000201, + 0x0102020201010000, 0x0102020201010101, 0x0102020201010200, 0x0102020201010202, + 0x0102020201020100, 0x0102020201020101, 0x0102020201020201, 0x0102020202000102, + 0x0102020202010100, 0x0102020202010200, 0x0102020202010202, 0x0102020202020102, + 0x0200000000000000, 0x0200000000000002, 0x0200000000000200, 0x0200000000000202, + 0x0200000000020000, 0x0200000000020002, 0x0200000000020200, 0x0200000000020202, + 0x0200000001000101, 0x0200000001010000, 0x0200000001010001, 0x0200000001010100, + 0x0200000001010102, 0x0200000001010201, 0x0200000001020101, 0x0200000002000000, + 0x0200000002000002, 0x0200000002000200, 0x0200000002000202, 0x0200000002010101, + 0x0200000002020000, 0x0200000002020002, 0x0200000002020200, 0x0200000002020202, + 0x0200000100000101, 0x0200000100010001, 0x0200000100010100, 0x0200000100010102, + 0x0200000100010201, 0x0200000100020101, 0x0200000101000001, 0x0200000101000100, + 0x0200000101000201, 0x0200000101010000, 0x0200000101010002, 0x0200000101010101, + 0x0200000101010102, 0x0200000101010200, 0x0200000101010201, 0x0200000101020100, + 0x0200000101020102, 0x0200000101020201, 0x0200000102000101, 0x0200000102000201, + 0x0200000102010100, 0x0200000102010102, 0x0200000102010201, 0x0200000102020101, + 0x0200000200000000, 0x0200000200000002, 0x0200000200000200, 0x0200000200000202, + 0x0200000200010101, 0x0200000200020000, 0x0200000200020002, 0x0200000200020200, + 0x0200000200020202, 0x0200000201010001, 0x0200000201010100, 0x0200000201010201, + 0x0200000201020101, 0x0200000202000000, 0x0200000202000002, 0x0200000202000200, + 0x0200000202000202, 0x0200000202010101, 
0x0200000202020000, 0x0200000202020002, + 0x0200000202020200, 0x0200000202020202, 0x0200010000010100, 0x0200010000010201, + 0x0200010001000001, 0x0200010001000100, 0x0200010001010001, 0x0200010001010101, + 0x0200010001010202, 0x0200010001020001, 0x0200010001020100, 0x0200010001020201, + 0x0200010002010100, 0x0200010002010201, 0x0200010100000001, 0x0200010100000201, + 0x0200010100010002, 0x0200010100010101, 0x0200010100010202, 0x0200010100020102, + 0x0200010100020201, 0x0200010101000000, 0x0200010101000001, 0x0200010101000101, + 0x0200010101000200, 0x0200010101010001, 0x0200010101010100, 0x0200010101010101, + 0x0200010101010102, 0x0200010101010201, 0x0200010101010202, 0x0200010101020101, + 0x0200010101020102, 0x0200010101020200, 0x0200010101020202, 0x0200010102000001, + 0x0200010102000100, 0x0200010102000102, 0x0200010102000201, 0x0200010102010000, + 0x0200010102010002, 0x0200010102010101, 0x0200010102010200, 0x0200010102020102, + 0x0200010200010001, 0x0200010200010102, 0x0200010200010201, 0x0200010200020101, + 0x0200010201000001, 0x0200010201000100, 0x0200010201000201, 0x0200010201000202, + 0x0200010201010000, 0x0200010201010101, 0x0200010201010201, 0x0200010201010202, + 0x0200010201020001, 0x0200010201020102, 0x0200010201020202, 0x0200010202000101, + 0x0200010202010001, 0x0200010202010202, 0x0200010202020100, 0x0200020000000000, + 0x0200020000000002, 0x0200020000000200, 0x0200020000000202, 0x0200020000010101, + 0x0200020000020000, 0x0200020000020002, 0x0200020000020200, 0x0200020000020202, + 0x0200020001000001, 0x0200020001000101, 0x0200020001010001, 0x0200020001010100, + 0x0200020001010201, 0x0200020001020101, 0x0200020001020201, 0x0200020002000000, + 0x0200020002000002, 0x0200020002000200, 0x0200020002000202, 0x0200020002010101, + 0x0200020002020000, 0x0200020002020002, 0x0200020002020200, 0x0200020002020202, + 0x0200020100000101, 0x0200020100000102, 0x0200020100010001, 0x0200020100010100, + 0x0200020100010102, 0x0200020100020101, 0x0200020101000001, 0x0200020101000100, + 0x0200020101000102, 0x0200020101000201, 0x0200020101010000, 0x0200020101010002, + 0x0200020101010101, 0x0200020101010202, 0x0200020101020001, 0x0200020101020100, + 0x0200020102000101, 0x0200020102010102, 0x0200020102010201, 0x0200020102020101, + 0x0200020200000000, 0x0200020200000002, 0x0200020200000200, 0x0200020200000202, + 0x0200020200010101, 0x0200020200020000, 0x0200020200020002, 0x0200020200020200, + 0x0200020200020202, 0x0200020201000101, 0x0200020201010001, 0x0200020201010100, + 0x0200020201010102, 0x0200020202000000, 0x0200020202000002, 0x0200020202000200, + 0x0200020202000202, 0x0200020202010101, 0x0200020202020000, 0x0200020202020002, + 0x0200020202020200, 0x0200020202020202, 0x0201000000000101, 0x0201000000010001, + 0x0201000000010102, 0x0201000000010200, 0x0201000000010201, 0x0201000000020101, + 0x0201000001000001, 0x0201000001000102, 0x0201000001000201, 0x0201000001010101, + 0x0201000001010200, 0x0201000001010202, 0x0201000001020201, 0x0201000001020202, + 0x0201000002000101, 0x0201000002010001, 0x0201000002010100, 0x0201000002010102, + 0x0201000002010201, 0x0201000002020101, 0x0201000100000001, 0x0201000100000100, + 0x0201000100000102, 0x0201000100000201, 0x0201000100010000, 0x0201000100010101, + 0x0201000100010200, 0x0201000100010202, 0x0201000100020001, 0x0201000100020100, + 0x0201000100020102, 0x0201000100020201, 0x0201000101000000, 0x0201000101000101, + 0x0201000101010000, 0x0201000101010001, 0x0201000101010100, 0x0201000101010101, + 0x0201000101010102, 0x0201000101010201, 0x0201000101020002, 
0x0201000101020101, + 0x0201000102000100, 0x0201000102000102, 0x0201000102010002, 0x0201000102010101, + 0x0201000102010200, 0x0201000102020001, 0x0201000102020100, 0x0201000102020102, + 0x0201000102020201, 0x0201000200000101, 0x0201000200010001, 0x0201000200010100, + 0x0201000200010201, 0x0201000200020101, 0x0201000201000100, 0x0201000201000102, + 0x0201000201000201, 0x0201000201010000, 0x0201000201010002, 0x0201000201010101, + 0x0201000201010200, 0x0201000201020102, 0x0201000201020201, 0x0201000202000101, + 0x0201000202010100, 0x0201000202010102, 0x0201000202020201, 0x0201010000000001, + 0x0201010000000100, 0x0201010000000102, 0x0201010000010000, 0x0201010000010101, + 0x0201010000010200, 0x0201010000020102, 0x0201010001000000, 0x0201010001000202, + 0x0201010001010001, 0x0201010001010100, 0x0201010001010101, 0x0201010001010102, + 0x0201010001010200, 0x0201010001010201, 0x0201010001020000, 0x0201010001020001, + 0x0201010001020002, 0x0201010001020101, 0x0201010002000100, 0x0201010002000102, + 0x0201010002010002, 0x0201010002010100, 0x0201010002010101, 0x0201010002010200, + 0x0201010002020001, 0x0201010002020201, 0x0201010100000000, 0x0201010100000101, + 0x0201010100000200, 0x0201010100000202, 0x0201010100010000, 0x0201010100010001, + 0x0201010100010100, 0x0201010100010101, 0x0201010100010102, 0x0201010100010201, + 0x0201010100020001, 0x0201010100020101, 0x0201010100020201, 0x0201010100020202, + 0x0201010101000001, 0x0201010101000100, 0x0201010101000101, 0x0201010101000102, + 0x0201010101000201, 0x0201010101010000, 0x0201010101010001, 0x0201010101010002, + 0x0201010101010100, 0x0201010101010101, 0x0201010101010102, 0x0201010101010200, + 0x0201010101010201, 0x0201010101010202, 0x0201010101020001, 0x0201010101020100, + 0x0201010101020101, 0x0201010101020102, 0x0201010101020201, 0x0201010102000001, + 0x0201010102000101, 0x0201010102000200, 0x0201010102010001, 0x0201010102010002, + 0x0201010102010100, 0x0201010102010101, 0x0201010102010102, 0x0201010102010201, + 0x0201010102010202, 0x0201010102020000, 0x0201010102020002, 0x0201010102020101, + 0x0201010102020200, 0x0201010102020202, 0x0201010200000001, 0x0201010200000100, + 0x0201010200010000, 0x0201010200010101, 0x0201010200010201, 0x0201010200020000, + 0x0201010200020102, 0x0201010200020201, 0x0201010201000101, 0x0201010201000200, + 0x0201010201000201, 0x0201010201010001, 0x0201010201010002, 0x0201010201010101, + 0x0201010201010102, 0x0201010201010201, 0x0201010201020101, 0x0201010201020200, + 0x0201010202000002, 0x0201010202000100, 0x0201010202000201, 0x0201010202000202, + 0x0201010202010002, 0x0201010202010100, 0x0201010202010101, 0x0201010202020100, + 0x0201010202020102, 0x0201010202020201, 0x0201020000000101, 0x0201020000010102, + 0x0201020000010201, 0x0201020000020101, 0x0201020001000001, 0x0201020001000102, + 0x0201020001010000, 0x0201020001010002, 0x0201020001010101, 0x0201020001010102, + 0x0201020001010202, 0x0201020001020100, 0x0201020001020101, 0x0201020002000101, + 0x0201020002010001, 0x0201020002010102, 0x0201020002010201, 0x0201020002020101, + 0x0201020100000100, 0x0201020100000102, 0x0201020100000201, 0x0201020100010000, + 0x0201020100010002, 0x0201020100010101, 0x0201020100010200, 0x0201020100010202, + 0x0201020100020000, 0x0201020100020001, 0x0201020100020100, 0x0201020100020102, + 0x0201020101000000, 0x0201020101000002, 0x0201020101000101, 0x0201020101000200, + 0x0201020101000202, 0x0201020101010001, 0x0201020101010100, 0x0201020101010101, + 0x0201020101010102, 0x0201020101010201, 0x0201020101020002, 0x0201020101020101, + 
0x0201020101020102, 0x0201020101020202, 0x0201020102000001, 0x0201020102000100, + 0x0201020102010000, 0x0201020102010002, 0x0201020102010101, 0x0201020102010202, + 0x0201020102020001, 0x0201020102020102, 0x0201020200000101, 0x0201020200010101, + 0x0201020200020101, 0x0201020201000100, 0x0201020201000102, 0x0201020201000201, + 0x0201020201010000, 0x0201020201010101, 0x0201020201010200, 0x0201020201020001, + 0x0201020202000101, 0x0201020202010001, 0x0201020202010100, 0x0201020202010101, + 0x0201020202010102, 0x0202000000000000, 0x0202000000000002, 0x0202000000000200, + 0x0202000000000202, 0x0202000000010101, 0x0202000000020000, 0x0202000000020002, + 0x0202000000020200, 0x0202000000020202, 0x0202000001000101, 0x0202000001010001, + 0x0202000001010100, 0x0202000001010102, 0x0202000001010201, 0x0202000002000000, + 0x0202000002000002, 0x0202000002000200, 0x0202000002000202, 0x0202000002010101, + 0x0202000002020000, 0x0202000002020002, 0x0202000002020200, 0x0202000002020202, + 0x0202000100000101, 0x0202000100000201, 0x0202000100010001, 0x0202000100010100, + 0x0202000100010102, 0x0202000100010201, 0x0202000100010202, 0x0202000101000102, + 0x0202000101000201, 0x0202000101010001, 0x0202000101010101, 0x0202000101010200, + 0x0202000101010202, 0x0202000101020001, 0x0202000101020100, 0x0202000102000101, + 0x0202000102010000, 0x0202000102010002, 0x0202000102010102, 0x0202000102010201, + 0x0202000200000002, 0x0202000200000200, 0x0202000200000202, 0x0202000200010000, + 0x0202000200010201, 0x0202000200020002, 0x0202000200020200, 0x0202000200020202, + 0x0202000201000101, 0x0202000201010001, 0x0202000201010102, 0x0202000201010201, + 0x0202000201020101, 0x0202000202000000, 0x0202000202000002, 0x0202000202000200, + 0x0202000202000202, 0x0202000202010101, 0x0202000202020000, 0x0202000202020002, + 0x0202000202020200, 0x0202000202020202, 0x0202010000010201, 0x0202010000020101, + 0x0202010001000001, 0x0202010001000100, 0x0202010001010000, 0x0202010001010100, + 0x0202010001010101, 0x0202010001010200, 0x0202010001010202, 0x0202010001020001, + 0x0202010001020101, 0x0202010001020102, 0x0202010001020200, 0x0202010001020201, + 0x0202010002000101, 0x0202010100000102, 0x0202010100000201, 0x0202010100010000, + 0x0202010100010002, 0x0202010100010101, 0x0202010100010200, 0x0202010100020102, + 0x0202010100020201, 0x0202010101000002, 0x0202010101000101, 0x0202010101010001, + 0x0202010101010100, 0x0202010101010101, 0x0202010101010102, 0x0202010101010201, + 0x0202010101020101, 0x0202010101020202, 0x0202010102000001, 0x0202010102000100, + 0x0202010102000101, 0x0202010102000102, 0x0202010102000201, 0x0202010102010002, + 0x0202010102010101, 0x0202010102010200, 0x0202010200000101, 0x0202010200010001, + 0x0202010200010102, 0x0202010200010202, 0x0202010200020001, 0x0202010200020101, + 0x0202010201000100, 0x0202010201000102, 0x0202010201000202, 0x0202010201010002, + 0x0202010201010101, 0x0202010201010102, 0x0202010201010200, 0x0202010201020000, + 0x0202010201020002, 0x0202010202000102, 0x0202010202010000, 0x0202010202010101, + 0x0202010202010102, 0x0202010202010201, 0x0202010202020001, 0x0202010202020100, + 0x0202010202020102, 0x0202020000000000, 0x0202020000000002, 0x0202020000000200, + 0x0202020000000202, 0x0202020000020000, 0x0202020000020002, 0x0202020000020200, + 0x0202020000020202, 0x0202020001010001, 0x0202020001010100, 0x0202020001010102, + 0x0202020001010201, 0x0202020002000000, 0x0202020002000002, 0x0202020002000200, + 0x0202020002000202, 0x0202020002010101, 0x0202020002020000, 0x0202020002020002, + 0x0202020002020200, 
0x0202020002020202, 0x0202020100000101, 0x0202020100010100, + 0x0202020100010201, 0x0202020100020001, 0x0202020100020101, 0x0202020101000001, + 0x0202020101010000, 0x0202020101010101, 0x0202020101010202, 0x0202020101020001, + 0x0202020101020102, 0x0202020101020201, 0x0202020102010000, 0x0202020102010102, + 0x0202020200000000, 0x0202020200000002, 0x0202020200000200, 0x0202020200000202, + 0x0202020200020000, 0x0202020200020002, 0x0202020200020200, 0x0202020200020202, + 0x0202020201010001, 0x0202020201010100, 0x0202020201010102, 0x0202020202000000, + 0x0202020202000002, 0x0202020202000200, 0x0202020202000202, 0x0202020202010101, + 0x0202020202020000, 0x0202020202020002, 0x0202020202020200, 0x0202020202020202, +}; +#else +static const uint32_t iq1s_grid_us[2048] = { + 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000, + 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101, + 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200, + 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212, + 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011, + 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111, + 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220, + 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022, + 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220, + 0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101, + 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110, + 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111, + 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010, + 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210, + 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221, + 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021, + 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002, + 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101, + 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101, + 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211, + 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110, + 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022, + 0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121, + 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220, + 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001, + 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101, + 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102, + 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012, + 0x00110111, 0x00110210, 0x00120011, 
0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010, + 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111, + 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122, + 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222, + 0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001, + 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102, + 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101, + 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000, + 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101, + 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112, + 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110, + 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211, + 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012, + 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111, + 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120, + 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122, + 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121, + 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221, + 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001, + 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101, + 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101, + 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011, + 0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111, + 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011, + 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122, + 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121, + 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222, + 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101, + 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000, + 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200, + 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110, + 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112, + 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222, + 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021, + 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121, + 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201, + 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 
0x01211002, 0x01211101, 0x01211200, + 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101, + 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011, + 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010, + 0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211, + 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121, + 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000, + 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202, + 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202, + 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211, + 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112, + 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020, + 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121, + 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222, + 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102, + 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100, + 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110, + 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011, + 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111, + 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110, + 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121, + 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222, + 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201, + 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102, + 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201, + 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012, + 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010, + 0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010, + 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110, + 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011, + 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212, + 0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021, + 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021, + 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021, + 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101, + 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101, + 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 
0x12012100, + 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010, + 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111, + 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010, + 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111, + 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120, + 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120, + 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101, + 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001, + 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201, + 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210, + 0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211, + 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111, + 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112, + 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211, + 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010, + 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021, + 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122, + 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221, + 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102, + 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100, + 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101, + 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101, + 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101, + 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012, + 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110, + 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112, + 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210, + 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210, + 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210, + 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010, + 0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110, + 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122, + 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020, + 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021, + 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022, + 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120, + 0x11121121, 
0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222, + 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221, + 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001, + 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102, + 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201, + 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012, + 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111, + 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012, + 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110, + 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110, + 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121, + 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221, + 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220, + 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222, + 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000, + 0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201, + 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012, + 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011, + 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212, + 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221, + 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121, + 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202, + 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202, + 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002, + 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101, + 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210, + 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112, + 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011, + 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011, + 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210, + 0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020, + 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220, + 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222, + 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222, + 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001, + 0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010, + 0x10212111, 0x10222011, 0x10222110, 
0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111, + 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010, + 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110, + 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221, + 0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122, + 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202, + 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100, + 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101, + 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112, + 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111, + 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211, + 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222, + 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221, + 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022, + 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101, + 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211, + 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111, + 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111, + 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010, + 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121, + 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222, + 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000, + 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202, + 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000, + 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202, + 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110, + 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110, + 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222, + 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120, + 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022, + 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101, + 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202, + 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110, + 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110, + 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111, + 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111, + 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 
0x20100121, 0x20110021, 0x20110120, + 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121, + 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001, + 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202, + 0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001, + 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200, + 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011, + 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212, + 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012, + 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110, + 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012, + 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111, + 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020, + 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121, + 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222, + 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102, + 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102, + 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101, + 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212, + 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210, + 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111, + 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212, + 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221, + 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121, + 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002, + 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000, + 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202, + 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112, + 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111, + 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020, + 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221, + 0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022, + 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100, + 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201, + 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112, + 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211, + 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 
0x22211012, + 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121, + 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020, + 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120, + 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200, + 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200, + 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110, + 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011, + 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222, + 0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020, + 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222, +}; +#endif +// end copy https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L570 + +#ifndef HAVE_FANCY_SIMD +const uint64_t keven_signs[128] = { + 0x0101010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0x010101010101ffff, + 0xff01010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0xff01010101ffffff, + 0xff010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0xff010101ff01ffff, + 0x01010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0x01010101ffffffff, + 0xff0101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0xff0101ff0101ffff, + 0x010101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0x010101ff01ffffff, + 0x010101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0x010101ffff01ffff, + 0xff0101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0xff0101ffffffffff, + 0xff01ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0xff01ff010101ffff, + 0x0101ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0x0101ff0101ffffff, + 0x0101ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0x0101ff01ff01ffff, + 0xff01ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0xff01ff01ffffffff, + 0x0101ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0x0101ffff0101ffff, + 0xff01ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0xff01ffff01ffffff, + 0xff01ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0xff01ffffff01ffff, + 0x0101ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0x0101ffffffffffff, + 0xffff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0xffff01010101ffff, + 0x01ff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0x01ff010101ffffff, + 0x01ff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0x01ff0101ff01ffff, + 0xffff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0xffff0101ffffffff, + 0x01ff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0x01ff01ff0101ffff, + 0xffff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0xffff01ff01ffffff, + 0xffff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0xffff01ffff01ffff, + 0x01ff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0x01ff01ffffffffff, + 0x01ffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0x01ffff010101ffff, + 0xffffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0xffffff0101ffffff, + 0xffffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0xffffff01ff01ffff, + 0x01ffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0x01ffff01ffffffff, + 0xffffffff01010101, 0x01ffffff010101ff, 
0x01ffffff0101ff01, 0xffffffff0101ffff,
+    0x01ffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0x01ffffff01ffffff,
+    0x01ffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0x01ffffffff01ffff,
+    0xffffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0xffffffffffffffff,
+};
+#endif
+
+}
+
+/* moonll: changed mul_mat to take typeB and strideB */
+
+// Adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L406
+// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow
+bool iqk_mul_mat(long Nx, long Ny, long ne00,
+                 int typeA, const void * A, long strideA,
+                 int typeB, const void * B, long strideB,
+                 float * C, long stride_C, int ith, int nth) {
+
+    MulMat mm;
+
+    if (!MulMat::set_mul_mat(typeA, typeB, ne00, mm, Ny)) {
+        return false;
+    }
+
+    size_t row_size_qx = strideA*ggml_type_size(ggml_type(typeA));
+    size_t row_size_qy = strideB*ggml_type_size(ggml_type(typeB));
+
+    auto nrc_x = (Nx + nth - 1)/nth;
+    auto first_x = ith*nrc_x;
+    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
+
+    DataInfo info{C + first_x, (const char *)B, (size_t)stride_C, row_size_qy, 0, 1, nullptr, 0};
+
+    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
+
+    return true;
+}
+// end adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L406
+
+
+bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const void * A, const void * B,
+                     float * C, long nb1, long nb2, const void * vrow_mapping, int ith, int nth) {
+    const mmid_row_mapping * row_mapping = (const mmid_row_mapping *)vrow_mapping;
+    assert(row_mapping != nullptr);
+
+    MulMat mm;
+    int row_size_q8;
+    /* moonll
+
+    if (!MulMat::set_mul_mat(typeA, ne00, mm, row_size_q8, Ny)) {
+        return false;
+    }*/
+    int row_size_qx = ggml_row_size((ggml_type)typeA, ne00);
+    int nrc_x = (Nx + nth - 1)/nth;
+    int first_x = ith*nrc_x;
+    if (first_x + nrc_x > Nx) nrc_x = Nx - first_x;
+    DataInfo info{C + first_x, (const char *)B, nb1/sizeof(float), (size_t)row_size_q8, 0, ne11, row_mapping, nb2/sizeof(float)};
+    mm.mul_mat_NxM(ne00, (const char *)A + row_size_qx*first_x, row_size_qx, info, nrc_x, Ny);
+    return true;
+}
+
+#if defined __x86_64__ || defined(_M_X64)
+
+#if defined HAVE_FANCY_SIMD
+    #undef HAVE_FANCY_SIMD
+#endif
+#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
+    #define HAVE_FANCY_SIMD
+#endif
+//#define HAVE_FANCY_SIMD
+
+namespace {
+
+inline float hsum_float_4(__m128 x) {
+    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
+    x = _mm_add_ss(x, _mm_movehdup_ps(x));
+    return _mm_cvtss_f32(x);
+}
+inline float hsum_float_8(__m256 x) {
+    return hsum_float_4(_mm_add_ps(_mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1)));
+}
+
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+
+template  struct Q8 {
+
+    constexpr static int nrc_y = nrc;
+
+    Q8(const DataInfo& info) {
+        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
+    }
+
+#ifdef HAVE_FANCY_SIMD
+    inline __m512i load_quants64(int iy, int i, int j) const { return _mm512_loadu_si512((const __m512i*)y[iy][i].qs + j); }
+#endif
+    inline __m256i load_quants(int iy, int i, int j) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].qs + j); }
+    inline __m256i load_bsums(int iy, int i) const { return _mm256_loadu_si256((const __m256i*)y[iy][i].bsums); }
+    inline
float scale(int iy, int i) const { return y[iy][i].d; } + + const block_q8 * y[nrc_y]; +}; + +// Handles q4_K and q5_K scales/mins +struct Scales8K { + template + inline __m256i process_mins_and_scales(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) { + make_q4_scales(data, utmp); + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + const __m128i mins128 = _mm256_extracti128_si256(mins_and_scales, 1); + accum_mins(mins128, q8, i, c, accd); + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + return MM256_SET_M128I(sc128, sc128); + } +#ifdef HAVE_FANCY_SIMD + template + inline __m512i process_mins_and_scales_64(const uint8_t * data, float c, int i, const Q8& q8, __m256 * accd) { + auto scales = process_mins_and_scales(data, c, i, q8, accd); + return _mm512_inserti32x8(_mm512_castsi256_si512(scales), scales, 1); + } +#endif + template + inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const { + const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0])); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i q8s = q8.load_bsums(iy, i); + const __m256i prod = _mm256_madd_epi16(mins, q8s); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + } +#ifdef HAVE_FANCY_SIMD + const __m512i shuffles512[2] = { + _mm512_set_epi64(0x0706070607060706, 0x0302030203020302, 0x0706070607060706, 0x0302030203020302, + 0x0504050405040504, 0x0100010001000100, 0x0504050405040504, 0x0100010001000100), + _mm512_set_epi64(0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, 0x0f0e0f0e0f0e0f0e, 0x0b0a0b0a0b0a0b0a, + 0x0d0c0d0c0d0c0d0c, 0x0908090809080908, 0x0d0c0d0c0d0c0d0c, 0x0908090809080908) + }; +#endif + const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100), + _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)}; + + uint32_t utmp[4]; +}; + +template +inline void process_mins_16(const __m256i& all_scales, const Q8& q8, int i, float d, __m256 * accm) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i prod = _mm256_madd_epi16(all_scales, q8.load_bsums(iy, i)); + accm[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d * q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accm[iy]); + } +} +inline void prepare_scales_16(const __m256i& all_scales, __m256i * scales) { + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + scales[0] = MM256_SET_M128I(l_scales, l_scales); + scales[1] = MM256_SET_M128I(h_scales, h_scales); +} + +struct ScaleQ3 { + inline __m128i make_scales(const uint16_t * s8) const { + const uint16_t * scales16 = (const uint16_t *)s8; + uint32_t aux0 = scales16[0] | (scales16[1] << 16); + uint32_t aux1 = scales16[2] | (scales16[3] << 16); + uint32_t aux2 = scales16[4] | (scales16[5] << 16); + __m128i scales128 = _mm_set_epi32( + ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030), + ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030), + (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030), + (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030)); + return _mm_add_epi8(scales128, m32); + } + const __m128i m32 = _mm_set1_epi8(-32); +}; + +struct ScaleIQ4XS { + inline __m128i make_scales(const uint32_t scales_l, const uint16_t scales_h) { + uint32_t tmp32 = scales_h | (scales_h << 14); + const __m128i sh = 
_mm_slli_epi16(_mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(tmp32), hshift), hmask), 4); + const __m128i sl = _mm_and_si128(_mm_srlv_epi32(_mm_set1_epi32(scales_l), lshift), lmask); + return _mm_add_epi16(_mm_or_si128(sh, _mm_cvtepi8_epi16(_mm_shuffle_epi8(sl, lshuffle))), m32); + } + const __m128i hshift = _mm_set_epi32(12, 8, 4, 0); + const __m128i lshift = _mm_set_epi32(4, 0, 4, 0); + const __m128i hmask = _mm_set1_epi16(0x03); + const __m128i lmask = _mm_set1_epi8(0xf); + const __m128i lshuffle = _mm_set_epi32(0x07030602, 0x05010400, 0x07030602, 0x05010400); + const __m128i m32 = _mm_set1_epi16(-32); +}; + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1455 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct Scales8KBase { + template + inline void accum_mins(const __m128i& mins128, const Q8& q8, int i, float c, __m256 * accd) const { + const __m256i mins = MM256_SET_M128I(_mm_shuffle_epi8(mins128, shuffles[1]), _mm_shuffle_epi8(mins128, shuffles[0])); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i q8s = q8.load_bsums(iy, i); + const __m256i prod = _mm256_madd_epi16(mins, q8s); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(c*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + } + inline __m256i shuffle(__m128i mins) const { + return MM256_SET_M128I(_mm_shuffle_epi8(mins, shuffles[1]), _mm_shuffle_epi8(mins, shuffles[0])); + } + const __m128i shuffles[2] = {_mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100), + _mm_set_epi32(0x0f0e0f0e, 0x0d0c0d0c, 0x0b0a0b0a, 0x09080908)}; +}; +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1455 + +template +struct BaseDequantizer { + BaseDequantizer(const void * vx, size_t bx) : vx(vx), bx(bx) {} + inline void new_row(int ix) { + x = (const Block *)((const char *)vx + bx*ix); + } + + const void * vx; + size_t bx; + const Block * x; + + float d; +}; + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1698 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +__m128i inline load_iq4nl_values_128() { + static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; + return _mm_loadu_si128((const __m128i *)kvalues_iq4nl); +} + +__m256i inline load_iq4nl_values_256() { + auto val128 = load_iq4nl_values_128(); + return MM256_SET_M128I(val128, val128); +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1698 + +#ifdef HAVE_FANCY_SIMD +//====================================== Zen4 ================================================== + +struct BlockPermuter { + const __m512i permute1 = _mm512_set_epi64(11, 10, 9, 8, 3, 2, 1, 0); + const __m512i permute2 = _mm512_set_epi64(15, 14, 13, 12, 7, 6, 5, 4); +}; + +struct Q4Bits { + inline void prepare(const uint8_t * q4) { + auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0); + auto tmp1 = _mm512_and_si512(q4bits, ml); + auto tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + values[0] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2); + values[1] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2); + q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1); + tmp1 = _mm512_and_si512(q4bits, ml); + tmp2 = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + 
values[2] = _mm512_permutex2var_epi64(tmp1, perm.permute1, tmp2); + values[3] = _mm512_permutex2var_epi64(tmp1, perm.permute2, tmp2); + } + inline void prepare64(const uint8_t * q4) { + auto q4bits = _mm512_loadu_si512((const __m512i*)q4 + 0); + values[0] = _mm512_and_si512(q4bits, ml); + values[1] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + q4bits = _mm512_loadu_si512((const __m512i*)q4 + 1); + values[2] = _mm512_and_si512(q4bits, ml); + values[3] = _mm512_and_si512(_mm512_srli_epi16(q4bits, 4), ml); + } + __m512i values[4]; + const __m512i ml = _mm512_set1_epi8(0xf); + BlockPermuter perm; +}; + +struct Q2Bits { + inline void prepare(const uint8_t * q2) { + + auto q2bits = _mm512_loadu_si512((const __m512i*)q2); + auto tmp = _mm512_srli_epi16(q2bits, 2); + + values[0] = _mm512_permutex2var_epi64(q2bits, perm.permute1, tmp); + values[2] = _mm512_permutex2var_epi64(q2bits, perm.permute2, tmp); + values[1] = _mm512_and_si512(_mm512_srli_epi16(values[0], 4), ml); + values[3] = _mm512_and_si512(_mm512_srli_epi16(values[2], 4), ml); + values[0] = _mm512_and_si512(values[0], ml); + values[2] = _mm512_and_si512(values[2], ml); + } + __m512i values[4]; + const __m512i ml = _mm512_set1_epi8(0x03); + BlockPermuter perm; +}; + +struct DequantizerQ4K final : public BaseDequantizer { + DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]); + scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]); + } + + Q4Bits bits; + Scales8K s8k; +}; + +/* +moonll DequantizerIQ4XS +*/ + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1775 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +__m512i inline load_iq4nl_values_512() { + auto val256 = load_iq4nl_values_256(); + return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1); +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1775 + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1781 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct DequantizerIQ4XS final : public BaseDequantizer { + // Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1782 + DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_iq4nl_values_512()) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + prepare(x[i].qs); + auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h); + s8k.accum_mins(scales128, q8, i, -128.f*d, accd); + auto scales256 = MM256_SET_M128I(scales128, scales128); + auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1); + scales[0] = _mm512_shuffle_epi8(all_scales, shuffles[0]); + scales[1] = _mm512_shuffle_epi8(all_scales, shuffles[1]); + scales[2] = _mm512_shuffle_epi8(all_scales, shuffles[2]); + scales[3] = _mm512_shuffle_epi8(all_scales, shuffles[3]); + } + inline void prepare(const 
uint8_t * q4) {
+        bits.prepare64(q4);
+        // We now have in bits.values[0]: 0...15, 32...47, 64...79, 96...111
+        //                bits.values[1]: 16..31, 48...63, 80...95, 112..127
+        //                etc.
+        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
+        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
+        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
+        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
+        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
+        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
+    }
+
+    Q4Bits bits;
+    Scales8KBase s8k;
+    ScaleIQ4XS siq4;
+    const __m512i values;
+    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
+    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
+    const __m512i shuffles[4] = {
+        _mm512_inserti32x8(_mm512_set1_epi16(0x0100), _mm256_set1_epi16(0x0302), 1),
+        _mm512_inserti32x8(_mm512_set1_epi16(0x0504), _mm256_set1_epi16(0x0706), 1),
+        _mm512_inserti32x8(_mm512_set1_epi16(0x0908), _mm256_set1_epi16(0x0b0a), 1),
+        _mm512_inserti32x8(_mm512_set1_epi16(0x0d0c), _mm256_set1_epi16(0x0f0e), 1),
+    };
+};
+// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1781
+
+struct HighBit5 {
+    inline void apply(const uint8_t * h, Q4Bits& bits) {
+        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
+        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
+        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
+        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
+        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
+        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
+    }
+    const __m512i mh = _mm512_set1_epi8(0x10);
+};
+
+struct HighBit3 {
+    inline void apply(const uint8_t * h, Q2Bits& bits) {
+        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
+        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
+        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
+        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
+        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
+        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
+    }
+    const __m512i mh = _mm512_set1_epi8(0x04);
+};
+
+struct DequantizerQ5K final : public BaseDequantizer {
+    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
+    template
+    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
+        d = GGML_FP16_TO_FP32(x[i].d);
+        bits.prepare(x[i].qs);
+        hbits.apply(x[i].qh, bits);
+        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
+        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
+        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
+    }
+
+    Q4Bits bits;
+    HighBit5 hbits;
+    Scales8K s8k;
+};
+
+struct Scale16 {
+    inline void make_scales(const __m128i& scales8, __m512i * scales) const {
+        auto all_scales8 = MM256_SET_M128I(scales8, scales8);
+        auto scales1 =
_mm256_shuffle_epi8(all_scales8, shuffle1); + auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2); + scales[0] = _mm512_cvtepi8_epi16(scales1); + scales[1] = _mm512_cvtepi8_epi16(scales2); + } + template + inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8, + const Q8& q8, __m256 * accm, __m512i * scales) const { + process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm); + make_scales(scales8, scales); + } + const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202, + 0x05050505, 0x01010101, 0x04040404, 0x00000000); + const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a, + 0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808); +}; + +struct DequantizerQ2K final : public BaseDequantizer { + DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales); + } + + Q2Bits bits; + Scale16 sc16; + const __m128i m4 = _mm_set1_epi8(0xf); + +}; + +struct DequantizerQ3K final : public BaseDequantizer { + DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare(x[i].qs); + hbits.apply(x[i].hmask, bits); + auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales); + sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales); + } + + Q2Bits bits; + HighBit3 hbits; + ScaleQ3 sc3; + Scale16 sc16; + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m32 = _mm_set1_epi8(-32); +}; + +struct DequantizerQ6K final : public BaseDequantizer { + DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + bits.prepare64(x[i].ql); + add_high_bits(x[i].qh, bits); + auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales); + sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales); + } + + inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const { + auto hbits = _mm512_loadu_si512((const __m512i *)qh); + auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh); + auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh); + bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2)); + bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2)); + tmp1 = _mm512_and_si512(hbits, mh); + tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh); + bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2)); + bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2)); + } + + Q4Bits bits; + HighBit3 hbits; + Scale16 sc16; + + const __m512i mh = _mm512_set1_epi8(0x30); + +}; + +template +static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) 
{ + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L2408 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +template +inline void compute_block(int iy, int i, float d, const Q8& q8, const __m512i * values, const __m512i * scales, __m512 * accd) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); +} + +template +static void mul_mat_qX_K_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = 
_mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2)); + sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4)); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + +template +static void mul_mat_iqX_k_q8_K_AVX512(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accm[nrc_y]; + __m512 accd[nrc_y]; + __m512i scales[4]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps(); + for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accm, scales); + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m512i p1 = _mm512_maddubs_epi16(deq.bits.values[0], q8.load_quants64(iy, i, 0)); + const __m512i p2 = _mm512_maddubs_epi16(deq.bits.values[1], q8.load_quants64(iy, i, 1)); + const __m512i p3 = _mm512_maddubs_epi16(deq.bits.values[2], q8.load_quants64(iy, i, 2)); + const __m512i p4 = _mm512_maddubs_epi16(deq.bits.values[3], q8.load_quants64(iy, i, 3)); + auto sumi = _mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_dpwssd_epi32(_mm512_setzero_si512(), + p1, scales[0]), p2, scales[1]), p3, scales[2]), p4, scales[3]); + accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1)); + info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256))); + } + + } +} + +template +static void mul_mat_qX_K_q8_K_AVX512_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + constexpr int k_nx = 2; + + Q8<1> q8(info); + + Dequantizer deq1(vx, bx); + Dequantizer deq2(vx, bx); + + Dequantizer * deq[k_nx]; + deq[0] = &deq1; + deq[1] = &deq2; + + __m512i scales[2*k_nx]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + auto accd = _mm512_setzero_ps(); + auto accm = _mm256_setzero_ps(); + + for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_row(ix); + + for (int i = 0; i < nb/k_nx; ++i) { + + for (int kx = 0; kx < k_nx; ++kx) deq[kx]->new_block(k_nx*i+kx, q8, &accm, scales+2*kx); + + for (int kx = 0; kx < k_nx; ++kx) { + compute_block(0, k_nx*i+kx, deq[kx]->d, q8, deq[kx]->bits.values, scales+2*kx, &accd); + } + + } + if (2*(nb/2) < nb) { + int i0 = 2*(nb/2); + deq[0]->new_block(i0, q8, &accm, scales); + compute_block(0, i0, deq[0]->d, q8, deq[0]->bits.values, scales, &accd); + } + + auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd), _mm512_extractf32x8_ps(accd, 1)); + info.store(ix, 0, hsum_float_8(_mm256_add_ps(accm, sum256))); + } +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L2408 + +#else +// ===================================== Vanilla AVX2 ===================================== + +struct Q4Bits { + inline void prepare(const uint8_t * q4, int j) { + auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0); + values[0] = 
_mm256_and_si256(q4bits, ml); + values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1); + values[2] = _mm256_and_si256(q4bits, ml); + values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + } + inline void prepare64(const uint8_t * q4, int j) { + auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+0); + values[0] = _mm256_and_si256(q4bits, ml); + values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j+1); + values[1] = _mm256_and_si256(q4bits, ml); + values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml); + } + inline void prepare16(const uint8_t * q4, int j) { + values[0] = dequant16(q4 + 64*j + 0); + values[1] = dequant16(q4 + 64*j + 16); + values[2] = dequant16(q4 + 64*j + 32); + values[3] = dequant16(q4 + 64*j + 48); + } + inline __m256i dequant16(const uint8_t * qs) const { + const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs); + const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128); + return _mm256_and_si256(ml, aux256); + }; + __m256i values[4]; + const __m256i ml = _mm256_set1_epi8(0xf); +}; + +struct Q2Bits { + inline void prepare(const uint8_t * q2, int j) { + auto q2bits = _mm256_loadu_si256((const __m256i *)q2 + j); + values[0] = _mm256_and_si256(q2bits, ml); + values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml); + values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml); + values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml); + } + __m256i values[4]; + const __m256i ml = _mm256_set1_epi8(0x03); +}; + +struct HighBit5 { + inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); } + inline void apply(Q4Bits& bits, bool do_shift) { + bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh)); + bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh)); + bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); + bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh)); + if (do_shift) { + hbits = _mm256_srli_epi16(hbits, 4); + } + } + const __m256i mh = _mm256_set1_epi8(0x10); + __m256i hbits; +}; + +struct HighBit3 { + inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); } + inline void apply(Q2Bits& bits, bool do_shift) { + bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); + bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh)); + bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh)); + bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh)); + if (do_shift) { + hbits = _mm256_srli_epi16(hbits, 4); + } + } + const __m256i mh = _mm256_set1_epi8(0x04); + __m256i hbits; +}; + + +/* +template +inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { + if (j == 0) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], 
q8.load_quants(iy, i, 2))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); + } + } else { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); + } + } +}*/ + +struct DequantizerQ4K final : public BaseDequantizer { + DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { + d = GGML_FP16_TO_FP32(x[i].d); + return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + } + + Q4Bits bits; + Scales8K s8k; +}; + +struct DequantizerIQ4XS final : public BaseDequantizer { + DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {} + template + inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { + d = GGML_FP16_TO_FP32(x[i].d); + auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h); + s8k.accum_mins(scales128, q8, i, -128.f*d, accd); + return MM256_SET_M128I(scales128, scales128); + } + inline void prepare(int i, int j) { + bits.prepare16(x[i].qs, j); + bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]); + bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]); + bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]); + bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]); + } + + static __m256i load_values() { + static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241}; + auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); + return MM256_SET_M128I(val128, val128); + } + + Q4Bits bits; + Scales8K s8k; + ScaleIQ4XS siq4; + const __m256i values; +}; + +struct DequantizerQ5K final : public BaseDequantizer { + DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline __m256i new_block(int i, const Q8& q8, __m256 * accd) { + d = GGML_FP16_TO_FP32(x[i].d); + hbits.load(x[i].qh); + return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + hbits.apply(bits, j == 0); + } + + Q4Bits bits; + HighBit5 hbits; + Scales8K s8k; +}; + +template +inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d, + __m256 * accm, __m256i * scales) { + const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); + process_mins_16(all_scales, q8, i, d, accm); + prepare_scales_16(all_scales, scales); +} + +struct DequantizerQ3K final : public BaseDequantizer { + DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + 
hbits.load(x[i].hmask); + process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + hbits.apply(bits, j == 0); + } + + Q2Bits bits; + HighBit3 hbits; + ScaleQ3 sc3; + + const __m128i m32 = _mm_set1_epi8(-32); +}; + +struct DequantizerQ2K final : public BaseDequantizer { + DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm); + prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs, j); + } + + Q2Bits bits; + + const __m128i m4 = _mm_set1_epi8(0xf); +}; + +struct DequantizerQ6K final : public BaseDequantizer { + DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + template + inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) { + d = GGML_FP16_TO_FP32(x[i].d); + process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales); + } + inline void prepare(int i, int j) { + bits.prepare64(x[i].ql, j); + auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j); + bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh)); + bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh)); + bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh)); + bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh)); + } + + Q4Bits bits; + const __m256i mh = _mm256_set1_epi8(0x30); +}; + + +inline __m256i get_scale_shuffle_8(int i); + +inline void set_scales_8(const __m256i& all_scales, int j, __m256i* scales); + +inline __m256i get_scale_shuffle_16(int i); + +inline void set_scales_16(const __m256i& all_scales, __m256i* scales); + + +template +static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%QK_K == 0); + const int nb = n/QK_K; + + Q8 q8(info); + + __m256i all_scales[2]; + __m256i scales[4]; + __m256 accd[nrc_y]; + + Dequantizer deq(vx, bx); + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + deq.new_block(i, q8, accd, all_scales); + + __m256i sumi[nrc_y]; + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j); + set_scales_16(all_scales[j], scales); + multiply_add(deq.bits, scales, j, i, q8, sumi); + } + + for (int iy = 0; iy < nrc_y; ++iy) { + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + + } + +} + +template +static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx); + + __m256 accd[nrc_y]; + __m256i scales[4]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for 
(int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + auto all_scales = deq.new_block(i, q8, accd); + + __m256i sumi[nrc_y]; + + for (int j = 0; j < QK_K/128; ++j) { + + deq.prepare(i, j); + + set_scales_8(all_scales, j, scales); + + multiply_add(deq.bits, scales, j, i, q8, sumi); + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); + accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + + } +} +#endif // Zen4 or vanilla AVX2 + + + +// +// ============================== Legacy quants +// + +struct DotHelper { + const __m256i m1 = _mm256_set1_epi16(1); +#if defined(__AVX512VNNI__) && defined(__AVX512VL__) + inline __m256i dot(__m256i x, __m256i y) const { + return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y); + } +#else + inline __m256i dot(__m256i x, __m256i y) const { + return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y)); + } +#endif +}; + +struct SignedDot { + DotHelper helper; + inline __m256i compute(__m256i x, __m256i y) const { + return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x)); + } +}; +struct UnsignedDot { + DotHelper helper; + inline __m256i compute(__m256i x, __m256i y) const { + return helper.dot(x, y); + } +}; +template struct Sum4 { + Dot dot; + inline __m256i compute(const __m256i * qx, const Q8 * y) const { + const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs)); + const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs)); + const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs)); + const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs)); + const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1 + const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3 + return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3 + } +}; + +struct Sum4_Q8 { + SignedDot dot; + static inline __m256i add1(__m256i a, __m256i b) { + return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b)); + } + static inline __m256i add2(__m256i a, __m256i b) { + return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b)); + } + inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const { + const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs)); + const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs)); + const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs)); + const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs)); + const __m256i p01 = add1(p0, p1); // 0,1, 0,1, 0,1, 0,1 + const __m256i p23 = add1(p2, p3); // 2,3, 2,3, 2,3, 2,3 + return add2(p01, p23); // returns 0,1,2,3, 0,1,2,3 + } +}; + +struct ScaleHelperQ_0 { + ggml_half scales8[4]; + template + inline __m128 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; + return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); + } + template + inline __m128 prepare4(__m128 other_scales, const Q * y) { + return _mm_mul_ps(other_scales, prepare4(y)); + } + template inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); } + template inline float 
prepare1(float d, const Q * y) const { return d*prepare1(y); } +}; +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8187 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +template +struct ScaleHelperQ_0_1 { + ggml_half scales8[4]; + template + inline __m256 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) scales8[j] = y[j].d; + auto s4 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8)); + return _mm256_set_m128(_mm_mul_ps(s4, min), s4); + } + template + inline __m256 prepare4(__m256 other_scales, const Q * y) { + return _mm_mul256_ps(other_scales, prepare4(y)); + } + template inline std::pair prepare1(const Q * y) const { + float d = GGML_FP16_TO_FP32(y->d); + return std::make_pair(d, -d*float(min_value)); + } + std::pair inline prepare1(const std::pair& dm, const block_q8_1 * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); + } + const __m128 min = _mm_set1_ps(float(-min_value)); +}; +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8187 + +struct ScaleHelperQ_1 { + uint32_t scales8[4]; + const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); + + template + inline __m256 prepare4(const Q * y) { + for (int j = 0; j < 4; ++j) { + // it is slightly faster to directly dereference (const uint32 *)&y[j].d, but some compilers + // complain that this breaks strict-aliasing rules. + memcpy(scales8 + j, &y[j].d, sizeof(uint32_t)); + } + return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle)); + } + + template + inline __m256 prepare4(__m256 other_scales, const Q * y) { + return _mm256_mul_ps(other_scales, prepare4(y)); + } + + template inline std::pair prepare1(const Q * y) const { + return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m)); + } + template inline std::pair prepare1(const std::pair& dm, const Q * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m)); + } + std::pair inline prepare1(const std::pair& dm, const block_q8_1 * y) const { + return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s)); + } +}; + +struct MinusType0 { + inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); } + inline float compute(float d, int) const { return d; } + inline float result(__m256 acc, int) const { return hsum_float_8(acc); } +}; + +template struct MinusType1 { + __m128 accm[nrc_y]; + MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); } + inline __m256 compute(__m256 dm, int iy) { + const __m128 d = _mm256_castps256_ps128(dm); + const __m128 m = _mm256_extractf128_ps(dm, 1); + accm[iy] = _mm_add_ps(accm[iy], m); + return _mm256_set_m128(d, d); + } + inline float compute(const std::pair& dm, int iy) { + accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f)); + return dm.first; + } + inline float result(__m256 acc, int iy) const { + const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); + return hsum_float_4(_mm_add_ps(sum, accm[iy])); + } +}; + +template struct AccumT { + __m256 acc[nrc_y]; + Minus accm; + AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); } + template + inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const 
DataInfo& info, int ix) { + auto qx = unp.quants(); + __m256 dall[nrc_y]; + for (int i = 0; i < nb/4; ++i) { + auto other_scales = unp.set_block_4(i); + for (int iy = 0; iy < nrc_y; ++iy) { + auto s12 = scales.prepare4(other_scales, y[iy] + 4*i); + dall[iy] = accm.compute(s12, iy); + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto pall = sum.compute(qx, y[iy] + 4*i); + acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]); + } + } + if (!is_multiple_of_4) { + for (int i = 4*(nb/4); i < nb; ++i) { + auto other_scales = unp.set_block(i); + for (int iy = 0; iy < nrc_y; ++iy) { + auto s12 = scales.prepare1(other_scales, y[iy] + i); + auto d = accm.compute(s12, iy); + const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs)); + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]); + } + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, accm.result(acc[iy], iy)); + //s[iy*bs] = accm.result(acc[iy], iy); + } + } +}; + +template +using AccumType0 = AccumT; + +template +using AccumType1 = AccumT, nrc_y, is_multiple_of_4>; + +using Sum4Type0 = Sum4; +using Sum4Type1 = Sum4; + +template +void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) { + Unpacker unp(vx, bx); + Sum4Type sum4; + Scales scales; + for (int ix = 0; ix < nrc_x; ++ix) { + unp.set_row(ix); + AccumType accum; + accum.compute(nb, unp, scales, sum4, y, info, ix); + } +} + +template +void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Unpacker::block_size() == 0); + Q8 q8(info); + int nb = n/Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + +template +void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Unpacker::block_size() == 0); + Q8 q8(info); + int nb = n/Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ_1, block_q8_1, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ_1, block_q8_1, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + +struct Dequantizer4bit { + const __m256i m4 = _mm256_set1_epi8(0xf); + inline __m256i dequant(const uint8_t * qs) const { + const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs); + return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4); + } +}; + +struct Q8_0_Dequantizer { + inline __m256i dequant(const block_q8_0 * x) const { + return _mm256_loadu_si256((const __m256i *)x->qs); + } +}; + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8455 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct Q8_0_1_Dequantizer { + inline __m256i dequant(const block_q8_0 * x) const { + return _mm256_add_epi8(_mm256_set1_epi8(127), _mm256_loadu_si256((const __m256i *)x->qs)); + } +}; +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8455 + +struct Q4_0_Dequantizer { + Dequantizer4bit b4; + const __m256i m8 = _mm256_set1_epi8(-8); + inline __m256i dequant(const block_q4_0 * x) const { + return _mm256_add_epi8(b4.dequant(x->qs), m8); + } +}; + +struct Q4_1_Dequantizer { + 
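+    // Q4_1 carries an explicit (scale, min) pair per block, so the 4-bit nibbles are used
+    // unsigned as-is; no -8 offset is applied (contrast Q4_0_Dequantizer above), and the
+    // per-block min is accounted for via ScaleHelperQ_1 / MinusType1 on the accumulation side.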
Dequantizer4bit b4; + inline __m256i dequant(const block_q4_1 * x) const { + return b4.dequant(x->qs); + } +}; + +struct HBitDequantizer { + const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); + const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + const __m256i minus1 = _mm256_set1_epi64x(-1); + inline __m256i to_bytes(const uint8_t * bits) const { + // Note: Data in all ggml quants is at least 2-byte aligned. + // => we can cast to uint16_t and use or on two consecutive entries + // which is faster than memcpy + const uint16_t * aux16 = (const uint16_t *)bits; + const uint32_t aux32 = aux16[0] | (aux16[1] << 16); + //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t)); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle); + bytes = _mm256_or_si256(bytes, mask); + return _mm256_cmpeq_epi8(bytes, minus1); + } +}; + +struct Q5_0_Dequantizer { + Dequantizer4bit b4; + HBitDequantizer hbit; + const __m256i mh = _mm256_set1_epi8((char)0xF0); + inline __m256i dequant(const block_q5_0 * x) const { + const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh); + return _mm256_or_si256(b4.dequant(x->qs), vqh); + } +}; + +struct Q5_1_Dequantizer { + Dequantizer4bit b4; + HBitDequantizer hbit; + const __m256i mh = _mm256_set1_epi8(0x10); + inline __m256i dequant(const block_q5_1 * x) const { + const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh); + return _mm256_or_si256(b4.dequant(x->qs), vqh); + } +}; + +template +struct Q_Unpacker { + Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {} + + const char * cx_0; + const Q * x; + size_t bx; + + Scales scales; + Dequantizer deq; + + __m256i qx[4]; + + inline const __m256i* quants() const { return qx; } + + inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); } + + inline auto set_block_4(int i) { + for (int j = 0; j < 4; ++j) { + qx[j] = deq.dequant(x + 4*i + j); + } + return scales.prepare4(x + 4*i); + } + inline auto set_block(int i) { + qx[0] = deq.dequant(x + i); + return scales.prepare1(x + i); + } +}; + +struct Q8_0_Unpacker final : public Q_Unpacker { + Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_0; } +}; +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8574 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct Q8_0_1_Unpacker final : public Q_Unpacker, Q8_0_1_Dequantizer> { + Q8_0_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} +// using Sum4T = Sum4TypeQ81; + inline static int block_size() { return QK8_0; } +}; +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8574 +struct Q4_0_Unpacker final : public Q_Unpacker { + Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_0; } +}; +struct Q5_0_Unpacker final : public Q_Unpacker { + Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK5_0; } +}; +struct Q4_1_Unpacker final : public Q_Unpacker { + Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int block_size() { return QK4_1; } +}; +struct Q5_1_Unpacker final : public Q_Unpacker { + Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {} + inline static int 
block_size() { return QK4_1; } +}; + +template +void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n%Q8_0_Unpacker::block_size() == 0); + Q8 q8(info); + int nb = n/Q8_0_Unpacker::block_size(); + if (nb%4 == 0) { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } else { + mul_mat_qX_q8_Helper, ScaleHelperQ_0, block_q8_0, nrc_y>( + nb, vx, bx, info, q8.y, nrc_x + ); + } +} + + + + +/* +moonll +add some structs for DequantizerIQ2XXS +SimpleBits +EvenSignHelper +*/ +struct SimpleBits { + __m256i values[4]; +}; + +// fix for #829: Add checks of AVX512VPOPCNTDQ +#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__) +#define HAVE_AVX512_POPCNT 1 +#else +#define HAVE_AVX512_POPCNT 0 +#endif + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7736 +// with the addition of a branch that handles a missing _mm256_popcnt_epi32 instruction +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct EvenSignHelper { + #if defined HAVE_FANCY_SIMD + // #pragma message("Using AVX512VPOPCNTDQ in even sign helper") + union sbits_t { + __m128i vec; + __mmask32 mask[4]; + }; + IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const { + aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask); + + // fix for #829: Compatibility with processors using Intel Cascade Lake architecture + // If AVX512VPOPCNTDQ extension is not supported, use alternative implementation + #if HAVE_AVX512_POPCNT + auto pcnt = _mm256_popcnt_epi32(aux); + + #else + // Alternative implementation: Using standard bit counting method + __m256i pcnt; + int* pcnt_ptr = reinterpret_cast(&pcnt); + int* aux_ptr = reinterpret_cast(&aux); // Get address of aux directly, avoid unnecessary copies + + #pragma unroll 8 // Hint compiler to unroll loops, increasing throughput of SIMD computing + for (int i = 0; i < 8; i++) { + pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // Use compiler builtin popcount + } + #endif + + sbits_t sbits; + sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7))); + values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]); + values[1] = _mm256_mask_sub_epi8(values[1], sbits.mask[1], _mm256_setzero_si256(), values[1]); + //auto sign_bits = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7))); + //const __mmask32 * m32 = (const __mmask32 *)&sign_bits; + //values[0] = _mm256_mask_sub_epi8(values[0], m32[0], _mm256_setzero_si256(), values[0]); + //values[1] = _mm256_mask_sub_epi8(values[1], m32[1], _mm256_setzero_si256(), values[1]); + } + const __m256i shifts = _mm256_set_epi32(21, 14, 7, 0, 21, 14, 7, 0); + const __m256i mask = _mm256_set1_epi32(127); + const __m256i mone = _mm256_set1_epi32(1); + #else + inline void sign_value(uint32_t aux32, __m256i& value) const { + auto signs = _mm256_set_epi64x(keven_signs[(aux32 >> 21) & 127], keven_signs[(aux32 >> 14) & 127], + keven_signs[(aux32 >> 7) & 127], keven_signs[(aux32 >> 0) & 127]); + value = _mm256_sign_epi8(value, signs); + } + #endif +}; + +/* +moonll ad multiply_add for mul_mat_qX_K_q8_K_IQ_1 +add func +get_scale_shuffle_8 +get_scale_shuffle_16 +set_scales_16 +*/ + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1578 +// MIT licensed, Copyright 
(c) 2024-2025 Iwan Kawrakow +inline __m256i get_scale_shuffle_8(int i) { + return _mm256_set1_epi16((2*i) | ((2*i+1) << 8)); +} + +inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) { + scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0)); + scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1)); + scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2)); + scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3)); +} + + +inline __m256i get_scale_shuffle_16(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} + +inline void set_scales_16(const __m256i& all_scales, __m256i * scales) { + scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0)); + scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1)); + scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2)); + scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3)); +} + +template +inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) { + if (j == 0) { +#ifdef HAVE_FANCY_SIMD + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + sumi[iy] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + } +#else + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3))); + sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4)); + } +#endif + } else { +#ifdef HAVE_FANCY_SIMD + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + sumi[iy] = _mm256_dpwssd_epi32(sumi[iy], scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + } +#else + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4))); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], 
q8.load_quants(iy, i, 5))); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6))); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7))); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3)); + sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4)); + } +#endif + } +} + +/* +moonll ad multiply_add_1 for mul_mat_qX_K_q8_K_IQ_1 +add func +set_scales_8_iq +set_scales_16_iq + +add MUL_MAT +mul_mat_qX_K_q8_K_IQ_1 +mul_mat_qX_K_q8_K_IQ_N +mul_mat_qX_K_q8_K_IQ +*/ + +template +inline void multiply_add_1(int j, const Bits& bits, const __m256i * scales, const __m256i * q8, __m256i * sumi) { + if (j == 0) { +#ifdef HAVE_FANCY_SIMD + auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); + auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); + auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); + auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); + sumi[0] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[0], _mm256_packs_epi32(p1, p2)); + sumi[1] = _mm256_dpwssd_epi32(_mm256_setzero_si256(), scales[1], _mm256_packs_epi32(p3, p4)); +#else + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); + sumi[0] = _mm256_add_epi32(p1, p3); + sumi[1] = _mm256_add_epi32(p2, p4); +#endif + } else { +#ifdef HAVE_FANCY_SIMD + auto p1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[0], q8[0]); + auto p2 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[1], q8[1]); + auto p3 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[2], q8[2]); + auto p4 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), bits.values[3], q8[3]); + sumi[0] = _mm256_dpwssd_epi32(sumi[0], scales[0], _mm256_packs_epi32(p1, p2)); + sumi[1] = _mm256_dpwssd_epi32(sumi[1], scales[1], _mm256_packs_epi32(p3, p4)); +#else + const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8[0])); + const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8[1])); + const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8[2])); + const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8[3])); + sumi[0] = _mm256_add_epi32(sumi[0], _mm256_add_epi32(p1, p3)); + sumi[1] = _mm256_add_epi32(sumi[1], _mm256_add_epi32(p2, p4)); +#endif + } +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L1578 + + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7278 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +inline void set_scales_8_iq(int j, const __m256i& all_scales, __m256i * scales) { + //#ifdef HAVE_FANCY_SIMD + auto shuffle = j == 0 ? 
_mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100) + : _mm256_set_epi64x(0x0b0a0b0a0b0a0b0a, 0x0908090809080908, 0x0b0a0b0a0b0a0b0a, 0x0908090809080908); + scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); + scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(4))); + //#else + // set_scales_8(all_scales, j, scales); + //#endif + } + +inline void set_scales_16_iq(const __m256i& all_scales, __m256i * scales) { + #ifdef HAVE_FANCY_SIMD + auto shuffle = _mm256_set_epi64x(0x0706070607060706, 0x0302030203020302, 0x0504050405040504, 0x0100010001000100); + scales[0] = _mm256_shuffle_epi8(all_scales, shuffle); + scales[1] = _mm256_shuffle_epi8(all_scales, _mm256_add_epi8(shuffle, _mm256_set1_epi8(8))); + #else + set_scales_16(all_scales, scales); + #endif + } +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7278 + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7299 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +template +static void mul_mat_qX_K_q8_K_IQ_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const int nb = n / QK_K; + Q8<1> q8(info); + Dequantizer deq(vx, bx); + __m256i scales[2]; + __m256i q8_quants[4]; + for (int ix = 0; ix < nrc_x; ++ix) { + + __m256 accd = _mm256_setzero_ps(); + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + __m256i sumi[2], all_scales[Dequantizer::num_blocks/8]; + deq.new_block(i, all_scales); + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j, q8, q8_quants); + if constexpr (Dequantizer::num_blocks == 8) { + set_scales_8_iq(j, all_scales[0], scales); + } else { + set_scales_16_iq(all_scales[j], scales); + } + multiply_add_1(j, deq.bits, scales, q8_quants, sumi); + } + accd = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(0, i)), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi[0], sumi[1])), accd); + } + + info.store(ix, 0, hsum_float_8(accd)); + } + } + + +template +static void mul_mat_qX_K_q8_K_IQ_N(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const int nb = n / QK_K; + Q8 q8(info); + Dequantizer deq(vx, bx); + __m256i scales[4]; + __m256 accd[nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps(); + + deq.new_row(ix); + + for (int i = 0; i < nb; ++i) { + + __m256i sumi[nrc_y], all_scales[Dequantizer::num_blocks/8]; + //for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = _mm256_setzero_si256(); + __m256i mins; + float dmin = deq.new_block(i, all_scales, mins); + for (int iy = 0; iy < nrc_y; ++iy) { + auto bsums = q8.load_bsums(iy, i); + auto prod = _mm256_madd_epi16(mins, bsums); + accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(dmin*q8.scale(iy, i)), _mm256_cvtepi32_ps(prod), accd[iy]); + } + + for (int j = 0; j < QK_K/128; ++j) { + deq.prepare(i, j); + if constexpr (Dequantizer::num_blocks == 8) { + set_scales_8(all_scales[0], j, scales); + } else { + set_scales_16(all_scales[j], scales); + } + //multiply_add_iq(deq.bits, scales, j, i, q8, sumi); + multiply_add(deq.bits, scales, j, i, q8, sumi); + } + for (int iy = 0; iy < nrc_y; ++iy) { + const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i)); + accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]); + } + } + + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, hsum_float_8(accd[iy])); + } + } +} + +template 
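+// Dispatch wrapper: with AVX-512 (HAVE_FANCY_SIMD) the single right-hand-side row case goes
+// through mul_mat_qX_K_q8_K_IQ_1, every other case through mul_mat_qX_K_q8_K_IQ_N.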
+static void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); +#ifdef HAVE_FANCY_SIMD + if constexpr (nrc_y == 1) { + mul_mat_qX_K_q8_K_IQ_1(n, vx, bx, info, nrc_x); + } else { + mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); + } +#else + mul_mat_qX_K_q8_K_IQ_N(n, vx, bx, info, nrc_x); +#endif +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L7299 + +/* +moonll iq1s +core func for iq1s mul_mat_iq1_s_q8_K + +*/ +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L3813 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +template +static void mul_mat_iq1_s_q8_K(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + GGML_ASSERT(n%QK_K == 0); + Q8 q8(info); + __m256i qx[8]; + __m256i scales[4]; + __m256 acc[nrc_y] = {}; + auto delta_mask = _mm_set1_epi16(-32768); // to avoid stupid overflow warnings when using 0x8000 + __m256i shuffle0 = _mm256_set_epi64x(0x0302030203020302, 0x0100010001000100, 0x0302030203020302, 0x0100010001000100); + for (int ix = 0; ix < nrc_x; ++ix) { + auto iq1s = (const block_iq1_s *)((const char *)vx + ix*bx); + for (int ibl = 0; ibl < n/QK_K; ++ibl) { + float d = GGML_FP16_TO_FP32(iq1s[ibl].d); + auto qhb = _mm_loadu_si128((const __m128i *)iq1s[ibl].qh); + auto scales128 = _mm_and_si128(_mm_srli_epi16(qhb, 12), _mm_set1_epi16(7)); + scales128 = _mm_add_epi16(_mm_slli_epi16(scales128, 1), _mm_set1_epi16(1)); +#ifdef HAVE_FANCY_SIMD + auto mask = _mm_cmpeq_epi16_mask(_mm_and_si128(qhb, delta_mask), delta_mask); + auto deltas128 = _mm_mask_blend_epi16(mask, _mm_set1_epi16(-7), _mm_set1_epi16(-9)); +#else + auto mask = _mm_cmpeq_epi16(_mm_and_si128(qhb, delta_mask), delta_mask); + auto deltas128 = _mm_or_si128(_mm_and_si128(mask, _mm_set1_epi16(-9)), _mm_andnot_si128(mask, _mm_set1_epi16(-7))); +#endif + deltas128 = _mm_mullo_epi16(scales128, deltas128); + scales128 = _mm_slli_epi16(scales128, 3); + auto deltas_l = _mm_unpacklo_epi16(deltas128, deltas128); + auto deltas_h = _mm_unpackhi_epi16(deltas128, deltas128); + auto deltas = MM256_SET_M128I(deltas_h, deltas_l); // blocks 0,0, 1,1, 2,2, ..., 7,7 + auto all_scales = MM256_SET_M128I(scales128, scales128); + auto shuffle = shuffle0; + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + scales[ib64] = _mm256_shuffle_epi8(all_scales, shuffle); + shuffle = _mm256_add_epi8(shuffle, _mm256_set1_epi8(4)); + } + const uint8_t * qs = iq1s[ibl].qs; + const uint16_t * qh = iq1s[ibl].qh; + for (int ib = 0; ib < QK_K/32; ib += 2) { + qx[ib+0] = _mm256_set_epi64x(iq1s_grid_us[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid_us[qs[2] | ((qh[ib+0] << 2) & 0x700)], + iq1s_grid_us[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid_us[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + qx[ib+1] = _mm256_set_epi64x(iq1s_grid_us[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid_us[qs[6] | ((qh[ib+1] << 2) & 0x700)], + iq1s_grid_us[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid_us[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + qs += 8; + } + for (int iy = 0; iy < nrc_y; ++iy) { + auto bsums = q8.load_bsums(iy, ibl); + auto sumi = _mm256_setzero_si256(); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + auto qy1 = q8.load_quants(iy, ibl, 2*ib64+0); + auto qy2 = q8.load_quants(iy, ibl, 2*ib64+1); +#ifdef HAVE_FANCY_SIMD + auto dot1 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+0], qy1); + auto dot2 = 
_mm256_dpbusd_epi32(_mm256_setzero_si256(), qx[2*ib64+1], qy2); + sumi = _mm256_dpwssd_epi32(sumi, scales[ib64], _mm256_packs_epi32(dot1, dot2)); +#else + auto dot1 = _mm256_maddubs_epi16(qx[2*ib64+0], qy1); + auto dot2 = _mm256_maddubs_epi16(qx[2*ib64+1], qy2); + auto dot = _mm256_add_epi16(_mm256_unpacklo_epi64(dot1, dot2), _mm256_unpackhi_epi64(dot1, dot2)); + sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(scales[ib64], dot)); +#endif + } +#ifdef HAVE_FANCY_SIMD + sumi = _mm256_dpwssd_epi32(sumi, bsums, deltas); +#else + sumi = _mm256_add_epi32(sumi, _mm256_madd_epi16(bsums, deltas)); +#endif + acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d*q8.scale(iy, ibl)), _mm256_cvtepi32_ps(sumi), acc[iy]); + } + } + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, 0.125f*hsum_float_8(acc[iy])); + acc[iy] = _mm256_setzero_ps(); + } + } +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L3813 + +/* +moonll iq1s +DequantizerIQ2XXS +DequantizerIQ2XXS is important Dequantizer for DequantizerIQ1_S +*/ + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8035 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct DequantizerIQ2XXS final : public BaseDequantizer { + DequantizerIQ2XXS(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {} + + constexpr static int num_blocks = 8; + + union Data { + __m256i vec; + uint32_t val[8]; + }; + + inline __m128i load_scales(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + const uint16_t * a16 = (const uint16_t *)x[i].qs; + auto scales = _mm_srli_epi16(_mm_set_epi16(a16[31], a16[27], a16[23], a16[19], a16[15], a16[11], a16[7], a16[3]), 12); + return _mm_or_si128(_mm_slli_epi16(scales, 1), _mm_set1_epi16(1)); + } + + inline void new_block(int i, __m256i * scales) { + auto sc16 = load_scales(i); + scales[0] = MM256_SET_M128I(sc16, sc16); + } + inline float new_block(int i, __m256i * scales, __m256i& mins) { + auto sc16 = load_scales(i); + mins = scb.shuffle(sc16); + scales[0] = MM256_SET_M128I(sc16, sc16); + return -d*minv; + } + + inline static void make4(const uint32_t * aux32, __m256i * values) { + const uint8_t * aux8 = (const uint8_t *)aux32; + values[0] = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[ 1]], iq2xxs_grid[aux8[ 0]]); + values[1] = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[ 9]], iq2xxs_grid[aux8[ 8]]); + values[2] = _mm256_set_epi64x(iq2xxs_grid[aux8[19]], iq2xxs_grid[aux8[18]], iq2xxs_grid[aux8[17]], iq2xxs_grid[aux8[16]]); + values[3] = _mm256_set_epi64x(iq2xxs_grid[aux8[27]], iq2xxs_grid[aux8[26]], iq2xxs_grid[aux8[25]], iq2xxs_grid[aux8[24]]); + } + + IQK_ALWAYS_INLINE void sign_values(const uint32_t * aux32, __m256i * values) const { +#ifdef HAVE_FANCY_SIMD + esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1])), values+0); + esh.sign_2_values(MM256_SET_M128I(_mm_set1_epi32(aux32[7]), _mm_set1_epi32(aux32[5])), values+2); +#else + esh.sign_value(aux32[1], values[0]); + esh.sign_value(aux32[3], values[1]); + esh.sign_value(aux32[5], values[2]); + esh.sign_value(aux32[7], values[3]); +#endif + } + inline void make4_signed(const uint32_t * aux32, const __m256i& min_value, __m256i * values) const { + make4(aux32, values); + sign_values(aux32, values); + for (int k = 0; k < 4; ++k) values[k] = _mm256_add_epi8(values[k], min_value); + } + inline void 
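+    // Single-row variant of make4: the even-sign bits are applied to the q8 quants
+    // (q8_quants) rather than to the dequantized values, and no min_value offset is added;
+    // used from prepare(i, j, q8, q8_quants) on the nrc_y == 1 path.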
make4(const uint32_t * aux32, __m256i * values, __m256i * q8) const { + make4(aux32, values); + sign_values(aux32, q8); + } + inline void prepare(int i, int j) { + Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); + make4_signed(data.val, min_value, bits.values); + } + inline void prepare(int i, int j, const Q8<1>& q8, __m256i * q8_quants) { + for (int k = 0; k < 4; ++k) q8_quants[k] = q8.load_quants(0, i, 4*j+k); + Data data; data.vec = _mm256_loadu_si256((const __m256i *)x[i].qs + j); + make4(data.val, bits.values, q8_quants); + } + + constexpr static int minv = 43; + SimpleBits bits; + Scales8KBase scb; + EvenSignHelper esh; + const __m256i min_value = _mm256_set1_epi8(minv); + const __m256i shuffle = _mm256_set_epi32(7, 5, 3, 1, 7, 5, 3, 1); +}; + +/* +moonll +add Q8_0_Unpacker && DequantizerIQ2XXS support +add func mul_mat_qX_K_q8_K_IQ +*/ + +// Copied/adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9092 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +template void MulMat::set_functions(MulMat& m) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qX_0_q8_0_T; + m.funcs[1] = mul_mat_qX_0_q8_0_T; + m.funcs[2] = mul_mat_qX_0_q8_0_T; + m.funcs[3] = mul_mat_qX_0_q8_0_T; + m.funcs[4] = mul_mat_qX_0_q8_0_T; + m.funcs[5] = mul_mat_qX_0_q8_0_T; + m.funcs[6] = mul_mat_qX_0_q8_0_T; + m.funcs[7] = mul_mat_qX_0_q8_0_T; + } + else if constexpr (std::is_same_v || std::is_same_v|| std::is_same_v) { + m.funcs[0] = mul_mat_qX_1_q8_1_T; + m.funcs[1] = mul_mat_qX_1_q8_1_T; + m.funcs[2] = mul_mat_qX_1_q8_1_T; + m.funcs[3] = mul_mat_qX_1_q8_1_T; + m.funcs[4] = mul_mat_qX_1_q8_1_T; + m.funcs[5] = mul_mat_qX_1_q8_1_T; + m.funcs[6] = mul_mat_qX_1_q8_1_T; + m.funcs[7] = mul_mat_qX_1_q8_1_T; + } + else if constexpr (std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQ; + m.funcs[1] = mul_mat_qX_K_q8_K_IQ; + m.funcs[2] = mul_mat_qX_K_q8_K_IQ; + m.funcs[3] = mul_mat_qX_K_q8_K_IQ; + m.funcs[4] = mul_mat_qX_K_q8_K_IQ; + m.funcs[5] = mul_mat_qX_K_q8_K_IQ; + m.funcs[6] = mul_mat_qX_K_q8_K_IQ; + m.funcs[7] = mul_mat_qX_K_q8_K_IQ; + } + else { +#ifdef HAVE_FANCY_SIMD + if constexpr (std::is_same_v) { + m.funcs[0] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[1] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[2] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[3] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[4] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[5] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[6] = mul_mat_iqX_k_q8_K_AVX512; + m.funcs[7] = mul_mat_iqX_k_q8_K_AVX512; + } else { + m.funcs[0] = mul_mat_qX_K_q8_K_AVX512_1; + m.funcs[1] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[2] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[3] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[4] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[5] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[6] = mul_mat_qX_K_q8_K_AVX512; + m.funcs[7] = mul_mat_qX_K_q8_K_AVX512; + } +#else + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qY_K_q8_K_T; + m.funcs[1] = mul_mat_qY_K_q8_K_T; + m.funcs[2] = mul_mat_qY_K_q8_K_T; + m.funcs[3] = mul_mat_qY_K_q8_K_T; + m.funcs[4] = mul_mat_qY_K_q8_K_T; + m.funcs[5] = mul_mat_qY_K_q8_K_T; + m.funcs[6] = mul_mat_qY_K_q8_K_T; + m.funcs[7] = mul_mat_qY_K_q8_K_T; + } else { + m.funcs[0] = mul_mat_qX_K_q8_K_T; + m.funcs[1] = mul_mat_qX_K_q8_K_T; + m.funcs[2] = mul_mat_qX_K_q8_K_T; + m.funcs[3] = mul_mat_qX_K_q8_K_T; + m.funcs[4] = mul_mat_qX_K_q8_K_T; + m.funcs[5] = mul_mat_qX_K_q8_K_T; + 
m.funcs[6] = mul_mat_qX_K_q8_K_T; + m.funcs[7] = mul_mat_qX_K_q8_K_T; + } +#endif + } +} +// end copied/adapted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9092 + +// Copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8622 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +struct QFBase { + #ifdef __AVX512F__ + constexpr static int k_step = 16; + using Data = __m512; + using Acc = __m512; + static inline Data load(const ggml_half * x) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x)); } + static inline Data load(const float * x) { return _mm512_loadu_ps(x); } + static inline Data load(const ggml_bf16_t * x) { + return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)x)), 16)); + } + static inline Acc acc(Acc prev, const Data& y, const Data& x) { + return _mm512_fmadd_ps(y, x, prev); + } + static inline Acc acc_first(const Data& y, const Data& x) { + return _mm512_mul_ps(y, x); + } + static inline Acc add(Acc x, Acc y) { return _mm512_add_ps(x, y); } + static inline float hsum(Acc acc) { + return _mm512_reduce_add_ps(acc); + } + template + static inline Data load4Floats(const Float * x) { + return _mm512_insertf32x4(_mm512_setzero_ps(), load128(x), 0); + } + static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { + acc = _mm512_fmadd_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00), acc); + acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_r4_first(const Data * xv, const Data& yv) { + auto acc = _mm512_mul_ps(xv[0], _mm512_shuffle_ps(yv, yv, 0x00)); + acc = _mm512_fmadd_ps(xv[1], _mm512_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm512_fmadd_ps(xv[2], _mm512_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm512_fmadd_ps(xv[3], _mm512_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline __m128 hsum_r4(Acc acc) { + auto sum1 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 0), _mm512_extractf32x4_ps(acc, 1)); + auto sum2 = _mm_add_ps(_mm512_extractf32x4_ps(acc, 2), _mm512_extractf32x4_ps(acc, 3)); + return _mm_add_ps(sum1, sum2); + } + #else + constexpr static int k_step = 8; + using Data = __m256; + using Acc = __m256; + static inline Data load(const ggml_half * x) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)x)); } + static inline Data load(const float * x) { return _mm256_loadu_ps(x); } + static inline Data load(const ggml_bf16_t * x) { + return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)x)), 16)); + } + static inline Acc acc(Acc prev, const Data& y, const Data& x) { + return _mm256_fmadd_ps(y, x, prev); + } + static inline Acc add(Acc x, Acc y) { return _mm256_add_ps(x, y); } + static inline Acc acc_r4(Acc acc, const Data * xv, const Data& yv) { + acc = _mm256_fmadd_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00), acc); + acc = _mm256_fmadd_ps(xv[1], _mm256_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_r4_first(const Data * xv, const Data& yv) { + auto acc = _mm256_mul_ps(xv[0], _mm256_shuffle_ps(yv, yv, 0x00)); + acc = _mm256_fmadd_ps(xv[1], 
_mm256_shuffle_ps(yv, yv, 0x55), acc); + acc = _mm256_fmadd_ps(xv[2], _mm256_shuffle_ps(yv, yv, 0xaa), acc); + acc = _mm256_fmadd_ps(xv[3], _mm256_shuffle_ps(yv, yv, 0xff), acc); + return acc; + } + static inline Acc acc_first(const Data& y, const Data& x) { + return _mm256_mul_ps(y, x); + } + static inline float hsum(Acc acc) { + return hsum_float_8(acc); + } + static inline __m128 hsum_r4(Acc acc) { + return _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1)); + } + template + static inline Data load4Floats(const Float * x) { + return _mm256_insertf128_ps(_mm256_setzero_ps(), load128(x), 0); + } + #endif + static inline __m128 load128(const ggml_half * x) { return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)x)); } + static inline __m128 load128(const float * x) { return _mm_loadu_ps(x); } + static inline __m128 load128(const ggml_bf16_t * x) { + return _mm_castsi128_ps(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)x)), 16)); + } + }; + template struct QFT final : public QFBase { + constexpr static int nrc = nrc_in; + QFT(const DataInfo& info) { + for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)info.src1_row(iy); + } + QFT(const char * cx, size_t bx) { + for (int iy = 0; iy < nrc; ++iy) y[iy] = (const Float *)(cx + iy*bx); + } + IQK_ALWAYS_INLINE Data load1(int iy, int i) const { return load(y[iy] + k_step*i); } + IQK_ALWAYS_INLINE Data load_tail(int iy, int i) const { return load4Floats(y[iy] + 4*i); } + IQK_ALWAYS_INLINE void load_r4(int ix, int i, Data * xv) const { + xv[0] = load1(ix+0, i); + xv[1] = load1(ix+1, i); + xv[2] = load1(ix+2, i); + xv[3] = load1(ix+3, i); + #ifdef __AVX512F__ + auto t0 = _mm512_unpacklo_ps(xv[0], xv[1]); + auto t1 = _mm512_unpacklo_ps(xv[2], xv[3]); + auto t2 = _mm512_unpackhi_ps(xv[0], xv[1]); + auto t3 = _mm512_unpackhi_ps(xv[2], xv[3]); + xv[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); + xv[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t0), _mm512_castps_pd(t1))); + xv[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); + xv[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(t2), _mm512_castps_pd(t3))); + #else + auto t0 = _mm256_unpacklo_ps(xv[0], xv[1]); + auto t1 = _mm256_unpacklo_ps(xv[2], xv[3]); + auto t2 = _mm256_unpackhi_ps(xv[0], xv[1]); + auto t3 = _mm256_unpackhi_ps(xv[2], xv[3]); + xv[0] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); + xv[1] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t0), _mm256_castps_pd(t1))); + xv[2] = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); + xv[3] = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(t2), _mm256_castps_pd(t3))); + #endif + } + const Float * y[nrc]; + }; + + + +template +IQK_NOINLINE void mul_mat_Qx_Qy_MxN(int n, const char * cx, size_t bx, int ix0, const DataInfo& info) { + int nb = n/QFBase::k_step; + int nb4 = n/4; + Qy y(info); + Qx x(cx + ix0*bx, bx); + QFBase::Data xv[Qx::nrc]; + QFBase::Acc acc[Qx::nrc*Qy::nrc]; + auto yv = y.load1(0, 0); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load1(ix, 0); + acc[ix] = QFBase::acc_first(yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load1(iy, 0); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc_first(yv, xv[ix]); + } + for (int i = 1; i < nb; ++i) { + yv = y.load1(0, i); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load1(ix, i); + acc[ix] = QFBase::acc(acc[ix], 
yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load1(iy, i); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); + } + } + for (int i = (QFBase::k_step/4)*nb; i < nb4; ++i) { + yv = y.load_tail(0, i); + for (int ix = 0; ix < Qx::nrc; ++ix) { + xv[ix] = x.load_tail(ix, i); + acc[ix] = QFBase::acc(acc[ix], yv, xv[ix]); + } + for (int iy = 1; iy < Qy::nrc; ++iy) { + yv = y.load_tail(iy, i); + for (int ix = 0; ix < Qx::nrc; ++ix) acc[Qx::nrc*iy + ix] = QFBase::acc(acc[Qx::nrc*iy + ix], yv, xv[ix]); + } + } + for (int iy = 0; iy < Qy::nrc; ++iy) for (int ix = 0; ix < Qx::nrc; ++ix) info.store(ix0+ix, iy, QFBase::hsum(acc[Qx::nrc*iy+ix])); +} +// This will handle any of f16 x f32, f32 x f16, f16 x f16, f32 x f32, with computations done +// in f32 (i.e., f16 is first converted to f32). It is easy to extend to computations done in +// f16, but I don't have a CPU capable of f16 vector arithmetic, so not doing it for now. +template +void mul_mat_fX_fY_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + const char * cx = (const char *)vx; + // TBD if we want this + //if constexpr (nrc_y == 1) { + // constexpr int k_nx = 2; + // for (int ix = 0; ix < nrc_x/k_nx; ++ix) { + // mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, ix*k_nx, info); + // } + // if (int lastx = k_nx*(nrc_x/k_nx); lastx < nrc_x) { + // int nx = nrc_x - lastx; + // switch (nx) { + // case 1: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // case 2: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // case 3: mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); break; + // } + // //mul_mat_Qx_Qy_Mx1, QFT>(n, cx, bx, lastx, info); + // } + // return; + //} +#ifdef __AVX512F__ + constexpr int k_nx = 5; +#else + constexpr int k_nx = nrc_y == 1 ? 
4 : 2; +#endif + for (int ix = 0; ix < nrc_x/k_nx; ++ix) { + mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, ix*k_nx, info); + } + int last_x = k_nx*(nrc_x/k_nx); + if (last_x == nrc_x) return; + int nx = nrc_x - last_x; +#ifdef __AVX512F__ + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 4: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } +#else + if constexpr (nrc_y == 1) { + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 2: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + case 3: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } + } else { + switch (nx) { + case 1: mul_mat_Qx_Qy_MxN, QFT>(n, cx, bx, last_x, info); break; + } + } +#endif +} + +template +void set_mul_mat_f(MulMat& mm) { + for (auto& f : mm.funcs) f = nullptr; + mm.funcs[0] = mul_mat_fX_fY_T<1, FloatX, FloatY>; + mm.funcs[1] = mul_mat_fX_fY_T<2, FloatX, FloatY>; + mm.funcs[2] = mul_mat_fX_fY_T<3, FloatX, FloatY>; + mm.funcs[3] = mul_mat_fX_fY_T<4, FloatX, FloatY>; + mm.funcs[4] = mul_mat_fX_fY_T<5, FloatX, FloatY>; +#ifndef __AVX512F__ + mm.funcs[5] = mul_mat_fX_fY_T<6, FloatX, FloatY>; +#endif +} +// end copied from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L8622 + +/* +moonll +add typeb TO compare return not expected type of weight matrix +add IQ2XSS +add IQ1_S +add GGML_TYPE_IQ4_XS +*/ + +// Modifications extracted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9231 +// MIT licensed, Copyright (c) 2024-2025 Iwan Kawrakow +bool MulMat::set_mul_mat(int typeA, int typeB, int ne00, MulMat& mm, int Ny) { + (void)Ny; + + auto expected_typeB = GGML_TYPE_Q8_K; + switch (typeA) { + case GGML_TYPE_Q2_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q3_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q4_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q5_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q6_K: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_IQ4_XS: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_IQ2_XXS: + assert (ne00 % QK_K == 0); + MulMat::set_functions(mm); + break; + case GGML_TYPE_Q4_0: + assert (ne00 % QK4_0 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0; + break; + case GGML_TYPE_Q4_1: + assert (ne00 % QK4_1 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_1_X4; + break; + case GGML_TYPE_Q5_0: + assert (ne00 % QK5_0 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0; + break; + case GGML_TYPE_Q5_1: + assert (ne00 % QK5_1 == 0); + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_1_X4; + break; + case GGML_TYPE_Q8_0: + assert (ne00 % QK8_0 == 0); +#ifdef HAVE_FANCY_SIMD + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_1_X4; +#else + MulMat::set_functions(mm); + expected_typeB = GGML_TYPE_Q8_0_X4; +#endif + break; + case GGML_TYPE_IQ1_S: + mm.funcs[0] = mul_mat_iq1_s_q8_K<1>; + mm.funcs[1] = mul_mat_iq1_s_q8_K<2>; + mm.funcs[2] = mul_mat_iq1_s_q8_K<3>; + mm.funcs[3] = mul_mat_iq1_s_q8_K<4>; + mm.funcs[4] = mul_mat_iq1_s_q8_K<5>; + 
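+            // funcs[k] is the IQ1_S kernel instantiated for k+1 rows of the Q8_K activation tile.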
mm.funcs[5] = mul_mat_iq1_s_q8_K<6>; + mm.funcs[6] = mul_mat_iq1_s_q8_K<7>; + mm.funcs[7] = mul_mat_iq1_s_q8_K<8>; + #ifdef HAVE_FANCY_SIMD + mm.func16 = mul_mat_iq1_s_q8_K<16>; + #endif + // row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + expected_typeB = GGML_TYPE_Q8_K; + break; + + default: + { + // printf("case:%d",typeA); + return false; + } + + } + + + + return ggml_type(typeB) == expected_typeB; + +} +// end extracted from https://github.com/ikawrakow/ik_llama.cpp/blob/474435f58b6a26bc549589966482207fee94aa60/ggml/src/iqk/iqk_mul_mat.cpp#L9231 + +} // namespace + +/* +iq1_s is not support for arm +*/ +#else // __aarch64__ + +namespace { + +template struct Q8 { + + constexpr static int nrc_y = nrc; + + Q8(const DataInfo& info) { + for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy); + } + + inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); } + inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); } + inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); } + inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); } + inline int16x8_t load_bsums8(int iy, int i) const { + auto q8s = vld1q_s16_x2(y[iy][i].bsums); + return vpaddq_s16(q8s.val[0], q8s.val[1]); + } + inline float scale(int iy, int i) const { return y[iy][i].d; } + + const block_q8 * y[nrc_y]; +}; + +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx, nrc_y); + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + float32x4_t acc[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + +//#pragma GCC unroll 4 + for (int i = 0; i < nb; ++i) { + + int32x4_t sumi[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); + + if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) { + deq.process_scales(i, q8, acc); + deq.prepare(i, 0); + deq.compute(q8, i, 0, sumi); + deq.prepare(i, 1); + deq.compute(q8, i, 1, sumi); + } else { + if constexpr (Dequantizer::num_blocks() == 8) { + auto scales = deq.new_block(i, q8, acc); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else if constexpr (Dequantizer::num_blocks() == 16) { + auto scales = deq.new_block(i, q8, acc); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else { + GGML_ASSERT(false); + } + } + +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); + } + } + +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / 
QK_K; + + Q8 q8(info); + + Dequantizer deq(vx, bx, nrc_y); + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + float32x4_t acc[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + + for (int i = 0; i < nb; ++i) { + + int32x4_t sumi[nrc_y]; + for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0); + + if constexpr (Dequantizer::num_blocks() == 8) { + auto scales = deq.new_block(i); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else if constexpr (Dequantizer::num_blocks() == 16) { + auto scales = deq.new_block(i); + deq.prepare(i, 0); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]); + deq.prepare(i, 1); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]); + } + else { + GGML_ASSERT(false); + } +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i))); + } + } +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} + +template +IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8, + const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) { + auto mzero = vdupq_n_s32(0); + const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val; + const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val; + + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1 + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2 + auto p12 = vpaddq_s32(p1, p2); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3 + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4 + auto p34 = vpaddq_s32(p3, p4); + + auto pall = vpaddq_s32(p12, p34); + sumi = vmlaq_s32(sumi, scales.val[j], pall); +} +template +IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8, + const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) { + auto mzero = vdupq_n_s32(0); + + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1 + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2 + auto p12 = vpaddq_s32(p1, p2); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3 + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4 + auto p34 = vpaddq_s32(p3, p4); + + auto pall = vpaddq_s32(p12, p34); + sumi = vmlaq_s32(sumi, scales, pall); +} + +template +IQK_ALWAYS_INLINE void 
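+// compute_16_blocks handles one 128-byte half of a super-block: four int8x16x2 q8 loads are
+// dot-multiplied against qx_1/qx_2 with ggml_vdotq_s32, pair-reduced with vpaddq_s32, and
+// folded into sumi via vmlaq_s32 using scales.val[2*j+0] and scales.val[2*j+1].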
compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8, + const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) { + + auto mzero = vdupq_n_s32(0); + auto q8b_1 = q8.load_quants(iy, i, 4*j+0); + auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1, + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 3, 3, 4, 4, + auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3 + sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // block 4, 4, 5, 5, + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]), + ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // block 6, 6, 7, 7, + auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7 + sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34); +} + +template +inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8s = q8.load_bsums8(iy, i); + int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s)); + int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s)); + float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2)); + acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i))); + } +} +template +inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8s = q8.load_bsums(iy, i); + int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0])); + int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0])); + int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1])); + int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1])); + float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4))); + acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i))); + } +} + +struct Scales8 { + uint32_t utmp[4]; + const uint8_t * sc8 = (const uint8_t *)utmp; + template + inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) { + make_q4_scales(x.scales, utmp); + int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8)); + accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin)); + + uint8x8_t scales8 = vld1_u8(sc8); + uint16x8_t scales16 = vmovl_u8(scales8); + int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))}; + return scales; + } +}; + +struct Q4bits { + const uint8x16_t m4b = vdupq_n_u8(0xf); + uint8x16x4_t b1, b2; + inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const { + b.val[0] = vandq_u8(val[0], m4b); + b.val[2] = vshrq_n_u8(val[0], 4); + b.val[1] = vandq_u8(val[1], m4b); + b.val[3] = vshrq_n_u8(val[1], 4); + } + inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const { + b.val[0] = vandq_u8(val[0], m4b); + b.val[1] = vshrq_n_u8(val[0], 4); + b.val[2] = 
vandq_u8(val[1], m4b); + b.val[3] = vshrq_n_u8(val[1], 4); + } + inline void prepare(const uint8_t * qs) { + auto q4bits = vld1q_u8_x2(qs); + prepare4(b1, q4bits.val); + q4bits = vld1q_u8_x2(qs+32); + prepare4(b2, q4bits.val); + } + inline void prepare_v2(const uint8_t * qs) { + auto q4bits = vld1q_u8_x4(qs); + prepare4(b1, q4bits.val+0); + prepare4(b2, q4bits.val+2); + } + inline void prepare64(const uint8_t * qs) { + auto q4bits = vld1q_u8_x4(qs); + b1.val[0] = vandq_u8(q4bits.val[0], m4b); + b1.val[1] = vandq_u8(q4bits.val[1], m4b); + b1.val[2] = vandq_u8(q4bits.val[2], m4b); + b1.val[3] = vandq_u8(q4bits.val[3], m4b); + b2.val[0] = vshrq_n_u8(q4bits.val[0], 4); + b2.val[1] = vshrq_n_u8(q4bits.val[1], 4); + b2.val[2] = vshrq_n_u8(q4bits.val[2], 4); + b2.val[3] = vshrq_n_u8(q4bits.val[3], 4); + } + inline void prepare16(const uint8_t * qs) { + auto q4bits = vld1q_u8_x2(qs); + prepare4_16(b1, q4bits.val); + q4bits = vld1q_u8_x2(qs+32); + prepare4_16(b2, q4bits.val); + } + inline void prepare16_v2(const uint8_t * qs) { + auto q4bits = vld1q_u8_x4(qs); + prepare4_16(b1, q4bits.val+0); + prepare4_16(b2, q4bits.val+2); + } +}; + +struct Q2bits { + const uint8x16_t m4b = vdupq_n_u8(0x03); + uint8x16x4_t b1, b2; + inline void prepare(const uint8_t * qs) { + auto q2bits = vld1q_u8_x2(qs); + b1.val[0] = vandq_u8(q2bits.val[0], m4b); + b1.val[1] = vandq_u8(q2bits.val[1], m4b); + + q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); + q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); + b1.val[2] = vandq_u8(q2bits.val[0], m4b); + b1.val[3] = vandq_u8(q2bits.val[1], m4b); + + q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); + q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); + b2.val[0] = vandq_u8(q2bits.val[0], m4b); + b2.val[1] = vandq_u8(q2bits.val[1], m4b); + + q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2); + q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2); + b2.val[2] = vandq_u8(q2bits.val[0], m4b); + b2.val[3] = vandq_u8(q2bits.val[1], m4b); + } +}; + +template +struct BaseDequantizer { + BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {} + inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); } + const void * vx; + const block_q * x; + const size_t bx; + const int nrc; +}; + +struct DequantizerQ4K final : public BaseDequantizer { + DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + return s8.process_scales_mins(x[i], q8, i, acc); + } + inline void prepare(int i, int j) { + if (nrc == 1) bits.prepare_v2(x[i].qs+64*j); + else bits.prepare(x[i].qs+64*j); + } + + Q4bits bits; + Scales8 s8; + + float d; +}; + +struct HighBit5 { + const uint8x16_t mhb = vdupq_n_u8(0x10); + uint8x16x2_t bits; + inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) { + b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb)); + b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb)); + b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb)); + b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb)); + + b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb)); + b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb)); + b2.val[2] = vorrq_u8(b2.val[2], 
vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb)); + b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb)); + + if (do_shift) { + bits.val[0] = vshrq_n_u8(bits.val[0], 4); + bits.val[1] = vshrq_n_u8(bits.val[1], 4); + } + } +}; + +struct HighBit3 { + const uint8x16_t mhb = vdupq_n_u8(0x04); + uint8x16x2_t bits; + inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) { + b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb)); + b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb)); + b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb)); + b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb)); + + b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb)); + b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb)); + b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb)); + b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb)); + + if (do_shift) { + bits.val[0] = vshrq_n_u8(bits.val[0], 4); + bits.val[1] = vshrq_n_u8(bits.val[1], 4); + } + } +}; + +struct DequantizerQ5K final : public BaseDequantizer { + DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + h.bits = vld1q_u8_x2(x[i].qh); + return s8.process_scales_mins(x[i], q8, i, acc); + } + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+64*j); + h.apply(bits.b1, bits.b2, j == 0); + } + + Q4bits bits; + HighBit5 h; + Scales8 s8; + + uint8x16x2_t hbits; + + float d; +}; + +inline int32x4x4_t make_wider(const int16x8x2_t& scales16) { + int32x4x4_t scales = { + vmovl_s16(vget_low_s16 (scales16.val[0])), + vmovl_s16(vget_high_s16(scales16.val[0])), + vmovl_s16(vget_low_s16 (scales16.val[1])), + vmovl_s16(vget_high_s16(scales16.val[1])), + }; + return scales; +} + +template +inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) { + int16x8x2_t scales16; + scales16.val[0] = vmovl_s8(vget_low_s8(scales8)); + scales16.val[1] = vmovl_s8(vget_high_s8(scales8)); + accum_mins_16(scales16, q8, acc, i, c); + return make_wider(scales16); +} + +struct DequantizerQ6K final : public BaseDequantizer { + DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d); + } + inline void prepare(int i, int j) { + + auto hbits = vld1q_u8_x2(x[i].qh + 32*j); + + bits.prepare64(x[i].ql+64*j); + bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb)); + bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb)); + bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb)); + bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb)); + + bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb)); + bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb)); + bits.b2.val[2] = vorrq_u8(bits.b2.val[2], 
vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb)); + bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb)); + + } + + Q4bits bits; + + const uint8x16_t mhb = vdupq_n_u8(0x30); + + float d; +}; + +struct DequantizerQ3K final : public BaseDequantizer { + DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + h.bits = vld1q_u8_x2(x[i].hmask); + const uint16_t * sc16 = (const uint16_t *)x[i].scales; + uint32_t aux0 = sc16[0] | (sc16[1] << 16); + uint32_t aux1 = sc16[2] | (sc16[3] << 16); + uint32_t aux2 = sc16[4] | (sc16[5] << 16); + aux32[0] = (aux0 & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030); + aux32[1] = (aux1 & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030); + aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030); + aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030); + return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d); + } + + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+32*j); + h.apply(bits.b1, bits.b2, j == 0); + } + + uint32_t aux32[4]; + + Q2bits bits; + + HighBit3 h; + + float d; +}; + +struct DequantizerQ2K final : public BaseDequantizer { + DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return true; } + + template + inline void process_scales(int i, const Q8& q8, float32x4_t * acc) { + d = GGML_FP16_TO_FP32(x[i].d); + auto scales_and_mins = vld1q_u8(x[i].scales); + auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4)); + int16x8x2_t scales16; + scales16.val[0] = vmovl_s8(vget_low_s8(mins8)); + scales16.val[1] = vmovl_s8(vget_high_s8(mins8)); + accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin)); + + scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf)); + } + + template + inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) { + process_scales(i, q8, acc); + int16x8x2_t scales16; + scales16.val[0] = vmovl_s8(vget_low_s8(vreinterpretq_s8_u8(scales8))); + scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8))); + return make_wider(scales16); + } + + template + inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) { + auto m1 = vdupq_n_u8(1); + auto shuffle = vdupq_n_u8(8*j); + bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1); + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto q8b_1 = q8.load_quants(iy, 
i, 4*j+0); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]), + vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]); + + auto q8b_2 = q8.load_quants(iy, i, 4*j+1); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]), + vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]); + + auto q8b_3 = q8.load_quants(iy, i, 4*j+2); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]), + vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]); + + auto q8b_4 = q8.load_quants(iy, i, 4*j+3); + sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]), + vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]); + } + } + + inline void prepare(int i, int j) { + bits.prepare(x[i].qs+32*j); + } + + uint32_t aux32[4]; + + uint8x16_t scales8; + + Q2bits bits; + + float d; +}; + +// ============================= i-quants + +struct DequantizerIQ4XS final : public BaseDequantizer { + + static int8x16_t load_values() { + static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + return vld1q_s8(iq4nl_values); + } + + DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); } + + template + inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) { + (void)q8; + (void)acc; + d = GGML_FP16_TO_FP32(x[i].d); + const uint16_t scales_h = x[i].scales_h; + const uint16_t * scales_l = (const uint16_t *)x[i].scales_l; + aux32[0] = scales_l[0] | (scales_l[1] << 16); + aux32[1] = aux32[0] >> 4; + // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7 + uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf)); + uint16_t * aux16 = (uint16_t *)aux32; + aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2; + // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7 + uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30)); + int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32)); + // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7 + scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff)); + int16x8_t scales16 = vmovl_s8(scales8); + int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))}; + return scales; + } + inline void prepare(int i, int j) { + bits.prepare16(x[i].qs+64*j); + for (int k = 0; k < 4; ++k) { + bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k])); + bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k])); + } + } + + Q4bits bits; + const int8x16_t values; + uint32_t aux32[2]; + + constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602}; + + float d; +}; + +struct SimpleBits { + uint8x16x4_t b1; + uint8x16x4_t b2; +}; + +IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) { + int32x4x2_t scales; + auto one = vdupq_n_u32(1); + scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1)); + scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1)); + return scales; +} + +inline void apply_signs_2(uint8x16_t * b, const 
uint64_t * signs, uint32_t sidx) { + auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127)))); + auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >>14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >>21) & 127)))); + b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1)); + b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2)); +} + +IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) { + return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1)); +} + +struct DequantizerIQ2XXS final : public BaseDequantizer { + DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); } + + inline int32x4_t unpack(int i, int j, uint8x16_t * q) const { + auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j)); + prepare_all(data, q); + return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1])); + } + +private: + + static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) { + const uint8_t * idx = (const uint8_t *)bits; + b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]}); + b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]}); + apply_signs_2(b, signs, bits[1]); + } + + inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) { + const uint32_t * q2 = (const uint32_t *)data.val; + prepare2(quants+0, q2+0, keven_signs); + prepare2(quants+2, q2+2, keven_signs); + prepare2(quants+4, q2+4, keven_signs); + prepare2(quants+6, q2+6, keven_signs); + } +}; + +inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) { + auto aux = vld1_u8(sc); + auto scales_l = vand_u8(aux, vdup_n_u8(0xf)); + auto scales_h = vshr_n_u8(aux, 4); + auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + + auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1))); + int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) }; + return make_wider(scales16); +} + +struct DequantizerIQ2XS final : public BaseDequantizer { + DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + SimpleBits bits; + float d; + + inline int32x4x4_t new_block(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + prepare_internal(i, 0); + return prepare_4bit_scales16(x[i].scales); + } + + inline void prepare(int i, int j) { + if (j == 1) prepare_internal(i, 1); + } + +private: + + static void make2(const uint16_t * qs, uint8x16_t * b) { + auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511)))); + auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511)))); + auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9)))); + auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9)))); + b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1)); + b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2)); + } + + inline static void make4(const uint16_t * qs, uint8x16_t * 
b) { + make2(qs + 0, b + 0); + make2(qs + 4, b + 2); + } + + IQK_ALWAYS_INLINE void prepare_internal(int i, int j) { + make4(x[i].qs + 16*j + 0, bits.b1.val); + make4(x[i].qs + 16*j + 8, bits.b2.val); + } + +}; + +// So, I hate to include this table, but with the GCC 12.3 compiler +// bundled in the Cosmopolitan tools, loading the unpacked sign bytes +// from this table using the packed 8 sign bits as index is faster than +// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to +// expand the bits to bytes. +static const uint64_t kall_signs[256] = { + 0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff, + 0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff, + 0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff, + 0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff, + 0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff, + 0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff, + 0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff, + 0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff, + 0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff, + 0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff, + 0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff, + 0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff, + 0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff, + 0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff, + 0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff, + 0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff, + 0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff, + 0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff, + 0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff, + 0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff, + 0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff, + 0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff, + 0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff, + 0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff, + 0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff, + 0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff, + 0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff, + 0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff, + 0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff, + 0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff, + 0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff, + 0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff, + 0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff, + 0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff, + 0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff, + 0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff, + 0xff0101ff01010101, 
0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff, + 0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff, + 0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff, + 0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff, + 0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff, + 0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff, + 0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff, + 0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff, + 0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff, + 0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff, + 0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff, + 0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff, + 0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff, + 0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff, + 0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff, + 0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff, + 0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff, + 0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff, + 0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff, + 0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff, + 0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff, + 0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff, + 0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff, + 0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff, + 0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff, + 0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff, + 0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff, + 0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff, +}; + +struct SignHelper { + + IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const { + auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]}); + // Normally we would expect this to be faster, but it isn't. + // auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1])); + // auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1)); + b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s)); + } + + // We would need these two if we weren't loading from the unpacked sign table. 
+ //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201)); + //const uint8x16_t m1 = vdupq_n_u8(1); +}; + +struct DequantizerIQ2S final : public BaseDequantizer { + DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 16; } + constexpr static bool should_scale_quants() { return false; } + + SimpleBits bits; + float d; + + inline int32x4x4_t new_block(int i) { + d = 0.125f * GGML_FP16_TO_FP32(x[i].d); + prepare_internal(i, 0, bits); + return prepare_4bit_scales16(x[i].scales); + } + + inline void prepare(int i, int j) { + if (j == 1) prepare_internal(i, 1, bits); + } + +private: + + static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) { + uint32_t aux32[2]; + const uint16_t * aux16 = (const uint16_t *)aux32; + for (int k = 0; k < 2; ++k) { + aux32[1] = (qh[k] << 4) | (qh[k] << 18); + aux32[0] = (aux32[1] << 4) & 0x03000300; + aux32[1] &= 0x03000300; + b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))), + vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1])))); + b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))), + vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3])))); + sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2; + sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2; + } + } + + void prepare_internal(int i, int j, SimpleBits& sb) { + + const auto * qs = x[i].qs + 16*j; + const auto * qh = x[i].qh + 4*j; + const auto * sign_bits = qs + QK_K/8; + + make4(sh, sign_bits+0, qs+0, qh+0, sb.b1.val); + make4(sh, sign_bits+8, qs+8, qh+2, sb.b2.val); + } + + SignHelper sh; +}; + +struct DequantizerIQ3XXS final : public BaseDequantizer { + DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); } + + inline int32x4_t unpack(int i, int j, uint8x16_t * q) const { + auto q3data = vld1q_u8_x2(x[i].qs + 32*j); + auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j)); + prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q); + return prepare_scales_8(gas); + } + +private: + + inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) { + b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]}); + b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]}); + apply_signs_2(b, keven_signs, sidx); + } + inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) { + make2(q3+ 0, signs[0], quants + 0); + make2(q3+ 8, signs[1], quants + 2); + make2(q3+16, signs[2], quants + 4); + make2(q3+24, signs[3], quants + 6); + } +}; + +struct DequantizerIQ3S final : public BaseDequantizer { + DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {} + + constexpr static int num_blocks() { return 8; } + constexpr static bool should_scale_quants() { return false; } + + SimpleBits bits; + float d; + + inline int32x4x2_t new_block(int i) { + d = GGML_FP16_TO_FP32(x[i].d); + uint32_t scales32[2]; + auto qs = vld1q_u8_x2(x[i].qs); + auto signs = vld1q_u8(x[i].signs); + + prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs); + + std::memcpy(scales32, x[i].scales, 4); + 
scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; + scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; + auto scales8 = vld1_u8((const uint8_t *)scales32); // 0, 2, 4, 6, 1, 3, 5, 7 + scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400))); + auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8)); + int32x4x2_t scales; + scales.val[0] = vmovl_s16(vget_low_s16(scales16)); + scales.val[1] = vmovl_s16(vget_high_s16(scales16)); + return scales; + } + + inline void prepare(int i, int j) { + if (j == 1) { + auto qs = vld1q_u8_x2(x[i].qs + 32); + auto signs = vld1q_u8(x[i].signs + 16); + prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs); + } + } + +private: + + static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh, + const int16x8_t& hshift, uint8x16_t * b) { + auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256))); + const uint16_t * idx = (const uint16_t *)&vindex; + b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]}); + sh.apply_signs_1x(b+0, sign_bits+0); + b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]}); + sh.apply_signs_1x(b+1, sign_bits+2); + } + static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, + const int16x8_t& hshift, uint8x16_t * b) { + auto idx_l = vld1q_u8(qs); + make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0); + make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2); + } + + static int16x8_t load_shift() { + static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + return vld1q_s16(k_shift); + } + + inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) { + auto signs = vld1q_u8(sign_bits); + auto s = (const uint8_t *)&signs; + make4(sh, s + 0, qs+ 0, qh+0, hshift, bits.b1.val); + make4(sh, s + 8, qs+16, qh+2, hshift, bits.b2.val); + } + + SignHelper sh; + const int16x8_t hshift = load_shift(); + +}; + +template +IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + assert(n % QK_K == 0); + const int nb = n / QK_K; + + Q8 q8(info); + Dequantizer deq(vx, bx, nrc_y); + uint8x16_t qx[8]; + int32x4_t sumi[nrc_y]; + float32x4_t acc[nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + + for (int i = 0; i < nb; ++i) { + float d = deq.new_block(i); + auto scales = deq.unpack(i, 0, qx); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + sumi[iy] = vdupq_n_s32(0); + compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]); + } + scales = deq.unpack(i, 1, qx); +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]); + acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy])); + } + } +#pragma GCC unroll 8 + for (int iy = 0; iy < nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} + +// =========================================== Legacy quants + +template +inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) { + for (int k = 0; k < 4; ++k) aux[k] = x[k].d; + return vld1_f16((const float16_t *)aux); +} + +template +inline float16x8_t 
load_scales_q1(const Block * x, ggml_half * aux) { + if constexpr (std::is_same_v) { + for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; } + } else { + for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; } + } + return vld1q_f16((const float16_t *)aux); +} + +struct Q4LegacyBits { + template + inline void prepare(const Block * x) { + for (int i = 0; i < 4; ++i) { + auto q4bits = vld1q_u8(x[i].qs); + b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b)); + b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4)); + } + } + inline void prepare1(const uint8_t * qs, int8x16_t * q) const { + auto q4bits = vld1q_u8(qs); + q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b)); + q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4)); + } + inline void prepare1(const uint8_t * qs) { + prepare1(qs, b); + } + const uint8x16_t m4b = vdupq_n_u8(0xf); + int8x16_t b[8]; +}; + +// One would think this commented out version would do better than the one below +// because it offers more opportunities to execute instructions in parallel. +// Instead, it runs significantly slower. Why? If the compiler is running out of vector registers +// cannot it just do the sequential version below on its own? +//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) { +// const auto q8b_1 = vld1q_s8_x2(qs + 0); +// auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]); +// const auto q8b_2 = vld1q_s8_x2(qs + 32); +// auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]); +// auto p1234 = vpaddq_s32(p12, p34); +// const auto q8b_3 = vld1q_s8_x2(qs + 64); +// auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]); +// const auto q8b_4 = vld1q_s8_x2(qs + 96); +// auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]); +// return vpaddq_s32(p1234, vpaddq_s32(p56, p78)); +//} + +inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) { + auto q8b = vld1q_s8_x2(qs + 0); + auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]); + q8b = vld1q_s8_x2(qs + 32); + auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]); + auto p1234 = vpaddq_s32(p12, p34); + q8b = vld1q_s8_x2(qs + 64); + auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]); + q8b = vld1q_s8_x2(qs + 96); + auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]); + return vpaddq_s32(p1234, vpaddq_s32(p56, p78)); +} + +template struct Q80 { + + constexpr static int nrc_y = nrc; + + Q80(const DataInfo& info) { + for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy); + } + + inline const int8_t * quant_data(int iy, int i) const { + const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; + return y4->qs; + } + + inline float16x4_t load_scales(int iy, int i) const { + const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i; + return vld1_f16((const float16_t *)y4->d); + } + + template + inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const { + auto qx_scales = deq.new_block(i); + for (int iy = 0; iy < nrc; ++iy) { + auto q8_scales = load_scales(iy, i); + sc16[iy] = vmul_f16(qx_scales, q8_scales); + } + } + + template + inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const { + deq.prepare1(i); + 
float d = GGML_FP16_TO_FP32(deq.x[i].d); + for (int iy = 0; iy < nrc; ++iy) { + auto q8b = vld1q_s8_x2(y[iy][i].qs); + auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]); + acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p)); + } + } + + const block_q8_0 * y[nrc_y]; +}; + +template struct Q81 { + + constexpr static int nrc_y = nrc; + + Q81(const DataInfo& info) { + for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy); + } + + inline const int8_t * quant_data(int iy, int i) const { + const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; + return y4->qs; + } + + inline float16x8_t load_scales(int iy, int i) const { + const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i; + return vld1q_f16((const float16_t *)y4->d); + } + + template + inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const { + auto qx_scales = deq.new_block(i); + for (int iy = 0; iy < nrc; ++iy) { + auto q8_scales = load_scales(iy, i); + auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales)); + acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m)); + sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales)); + } + } + + template + inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const { + deq.prepare1(i); + float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m); + for (int iy = 0; iy < nrc; ++iy) { + auto q8b = vld1q_s8_x2(y[iy][i].qs); + auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]); + acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p)); + acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s))); + } + } + + const block_q8_1 * y[nrc_y]; +}; + +template +struct BaseLegacyDequantizer { + + BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {} + + inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); } + + Q4LegacyBits bits; + + const void * vx; + const block_q * x; + size_t bx; +}; + +struct DequantizerQ40 final : public BaseLegacyDequantizer { + + DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i, int8x16_t * q) const { + bits.prepare1(x[i].qs, q); + q[0] = vaddq_s8(q[0], m8); + q[1] = vaddq_s8(q[1], m8); + } + inline void prepare1(int i) { + prepare1(i, bits.b); + } + + inline float16x4_t new_block(int i) { + ggml_half aux[4]; + for (int k = 0; k < 4; ++k) { + aux[k] = x[4*i+k].d; + prepare1(4*i+k, bits.b + 2*k); + } + return vld1_f16((const float16_t *)aux); + } + + const int8x16_t m8 = vdupq_n_s8(-8); + //ggml_half aux[4]; +}; + +struct DequantizerQ41 : public BaseLegacyDequantizer { + + DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i) { + bits.prepare1(x[i].qs); + } + + inline float16x8_t new_block(int i) { + uint32_t aux32[4]; + const uint32_t * s32 = (const uint32_t *)&x[4*i].d; + for (int k = 0; k < 4; ++k) { + aux32[k] = *s32; s32 += sizeof(block_q4_1)/4; + bits.prepare1(x[4*i+k].qs, bits.b + 2*k); + } + return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle))); + } + // Leaving this commented out attempt to be reminded that I already tried this. + // It has basically the same performance as the version above. 
+ //inline float16x8_t new_block(int i) { + // uint32x4_t scales = {}; + // const block_q4_1 * xi = x + 4*i; + // const uint32_t * s32 = (const uint32_t *)&xi->d; + // scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4; + // bits.prepare1(xi[0].qs, bits.b + 0); + // scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4; + // bits.prepare1(xi[1].qs, bits.b + 2); + // scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4; + // bits.prepare1(xi[2].qs, bits.b + 4); + // scales = vsetq_lane_u32(*s32, scales, 3); + // bits.prepare1(xi[3].qs, bits.b + 6); + // return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle))); + //} + + const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302}; +}; + +struct HighBit5Legacy { + inline uint8x16_t to_bytes(const uint8_t * qh) const { + uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle); + return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask)); + } + inline uint8x16_t to_negated_bytes(const uint8_t * qh) const { + uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle); + return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0)); + } + const uint64x2_t mask = vdupq_n_u64(0x8040201008040201); + const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1)); +}; + +struct DequantizerQ50 final : public BaseLegacyDequantizer { + + DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i, int8x16_t * q) const { + bits.prepare1(x[i].qs, q); + auto qh = x[i].qh; + q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0)))); + q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2)))); + } + inline void prepare1(int i) { + prepare1(i, bits.b); + } + + inline float16x4_t new_block(int i) { + ggml_half aux[4]; + for (int k = 0; k < 4; ++k) { + aux[k] = x[4*i+k].d; + prepare1(4*i+k, bits.b + 2*k); + } + return vld1_f16((const float16_t *)aux); + } + + HighBit5Legacy hbits; + + const uint8x16_t mh = vdupq_n_u8(0xf0); + +}; + +struct DequantizerQ80 final : public BaseLegacyDequantizer { + + DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i) { + bits.b[0] = vld1q_s8(x[i].qs); + bits.b[1] = vld1q_s8(x[i].qs+16); + } + + inline float16x4_t new_block(int i) { + ggml_half aux[4]; + for (int k = 0; k < 4; ++k) { + aux[k] = x[4*i+k].d; + bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs); + bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16); + } + return vld1_f16((const float16_t *)aux); + } + +}; + +struct DequantizerQ51 final : public BaseLegacyDequantizer { + + DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {} + + inline void prepare1(int i, int8x16_t * q) const { + bits.prepare1(x[i].qs, q); + auto qh = x[i].qh; + q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0)))); + q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2)))); + } + inline void prepare1(int i) { + bits.prepare1(x[i].qs, bits.b); + } + + inline float16x8_t new_block(int i) { + uint32_t aux32[4]; + const uint32_t * s32 = (const uint32_t *)&x[4*i].d; + for (int k = 0; k < 4; ++k) { + aux32[k] = *s32; s32 += sizeof(block_q5_1)/4; + prepare1(4*i+k, bits.b + 2*k); + } + return 
vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle))); + } + + HighBit5Legacy hbits; + + const uint8x16_t mh = vdupq_n_u8(0x10); + const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302}; + +}; + +template +inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) { + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i)); + auto scale = vcvt_f32_f16(sc16[iy]); + acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall)); + } +} + +template +inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) { + const int nb = n / QK4_1; + + float16x4_t sc16[Q8::nrc_y]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq.new_row(ix); + + float32x4_t acc[Q8::nrc_y]; + for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f); + + for (int i = 0; i < nb/4; ++i) { + q8.process_scales(i, deq, sc16, acc); + sum_4(i, deq, q8, sc16, acc); + } + for (int i = 4*(nb/4); i < nb; ++i) { + q8.process_1_block(i, deq, acc); + } + + for (int iy = 0; iy < Q8::nrc_y; ++iy) { + info.store(ix, iy, vaddvq_f32(acc[iy])); + } + } +} + +template +inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) { + const int nb = n / QK4_1; + + float16x4_t sc16[2]; + + for (int ix = 0; ix < nrc_x; ++ix) { + + deq1.new_row(ix); + deq2.new_row(ix); + + float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) }; + + for (int i = 0; i < nb/8; ++i) { + q8.process_scales(2*i+0, deq1, sc16+0, acc+0); + q8.process_scales(2*i+1, deq2, sc16+1, acc+1); + sum_4(2*i+0, deq1, q8, sc16+0, acc+0); + sum_4(2*i+1, deq2, q8, sc16+1, acc+1); + } + for (int i = 2*(nb/8); i < nb/4; ++i) { + q8.process_scales(i, deq1, sc16, acc); + sum_4(i, deq1, q8, sc16, acc); + } + for (int i = 4*(nb/4); i < nb; ++i) { + q8.process_1_block(i, deq1, acc); + } + + info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1]))); + } +} + +template +static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Q81 q8(info); + if constexpr (nrc_y == 1) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); + } else { + Dequantizer deq(vx, bx); + mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x); + } +} + +template +static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Q80 q8(info); + if constexpr (nrc_y == 1) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); + } else { + Dequantizer deq(vx, bx); + mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x); + } +} + +template +static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + Q81<1> q8(info); + mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x); +} + +template +static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) { + Dequantizer deq1(vx, bx), deq2(vx, bx); + Q80<1> q8(info); + mul_mat_qX_Y_q8_Y(n, deq1, deq2, q8, info, nrc_x); +} + +template void MulMat::set_functions(MulMat& m) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qX_0_q8_0; + m.funcs[1] = mul_mat_qX_0_q8_0; + m.funcs[2] = mul_mat_qX_0_q8_0; + m.funcs[3] = mul_mat_qX_0_q8_0; + m.funcs[4] = mul_mat_qX_0_q8_0; + m.funcs[5] = 
mul_mat_qX_0_q8_0; + m.funcs[6] = mul_mat_qX_0_q8_0; + m.funcs[7] = mul_mat_qX_0_q8_0; + } + else if constexpr (std::is_same_v || std::is_same_v) { + m.funcs[0] = mul_mat_qX_1_q8_1; + m.funcs[1] = mul_mat_qX_1_q8_1; + m.funcs[2] = mul_mat_qX_1_q8_1; + m.funcs[3] = mul_mat_qX_1_q8_1; + m.funcs[4] = mul_mat_qX_1_q8_1; + m.funcs[5] = mul_mat_qX_1_q8_1; + m.funcs[6] = mul_mat_qX_1_q8_1; + m.funcs[7] = mul_mat_qX_1_q8_1; + } + else if constexpr (std::is_same_v || std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>; + m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>; + m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>; + m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>; + m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>; + m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>; + m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>; + m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>; + } + else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>; + m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>; + m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>; + m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>; + m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>; + m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>; + m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>; + m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>; + } + else { + m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>; + m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>; + m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>; + m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>; + m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>; + m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>; + m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>; + m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>; + } +} + +bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) { + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + + (void)Ny; + // Uncommenting out this would disable iqk_mul_mat for matrix x vector multiplications. 
+ //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S || + // typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false; + + switch (typeA) { + case GGML_TYPE_Q2_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q3_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q4_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q5_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q6_K: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ4_XS: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ3_S: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ3_XXS: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ2_S: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ2_XS: + MulMat::set_functions(m); + break; + case GGML_TYPE_IQ2_XXS: + MulMat::set_functions(m); + break; + case GGML_TYPE_Q4_0: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + break; + case GGML_TYPE_Q4_1: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); + break; + case GGML_TYPE_Q5_0: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + break; + case GGML_TYPE_Q5_1: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00); + break; + case GGML_TYPE_Q8_0: + MulMat::set_functions(m); + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00); + break; + default: + return false; + } + return true; +} + +} + +#endif // __x86_64__ or __aarch64__ diff --git a/third_party/llamafile/sgemm.cpp b/third_party/llamafile/sgemm.cpp index 38f6d18..b4ef549 100644 --- a/third_party/llamafile/sgemm.cpp +++ b/third_party/llamafile/sgemm.cpp @@ -1,204 +1,7 @@ -// Adapted from -// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp -// Copyrigth 2024 Mozilla Foundation. -// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. - -// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- -// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi -// -// Copyright 2024 Mozilla Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "sgemm.h" -// #include -// #include -// #include -#include -// #include -#include -// #include "llamafile.h" - -static const struct GemmFuncs { - bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int); - bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*); - bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); - // typeof(llamafile_sgemm)* sgemm; - // typeof(llamafile_mixmul)* mixmul; - // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; - GemmFuncs() { -#if defined(__x86_64__) || defined(_M_X64) - // if (X86_HAVE(AVX)) { - // if (X86_HAVE(FMA)) { - // if (X86_HAVE(AVX2)) { - // if (X86_HAVE(AVX512F)) { - // if (X86_HAVE(AVX512VL) && // - // X86_HAVE(AVX512BW) && // - // X86_HAVE(AVX512DQ) && // - // X86_HAVE(AVX512_VNNI) && // - // X86_HAVE(AVX512_BF16)) { - // // AMD Zen4+ (2023-) - // sgemm = llamafile_sgemm_amd_zen4; - // mixmul = llamafile_mixmul_amd_zen4; - // iqk_mixmul = iqk_mul_mat_moe_zen4; - // } else { - // // Intel Xeon Skylake+ (2015-) - // sgemm = llamafile_sgemm_amd_avx512f; - // mixmul = llamafile_mixmul_amd_avx512f; - // iqk_mixmul = iqk_mul_mat_moe; - // } - // } else if (X86_HAVE(AVXVNNI)) { - // // Intel Alderlake (2021-) - // sgemm = llamafile_sgemm_amd_avxvnni; - // mixmul = llamafile_mixmul_amd_avxvnni; - // iqk_mixmul = iqk_mul_mat_moe; - // } else { - // // Intel Haswell/Broadwell/Skylake (2013-2020) - // // AMD Excavator (2015-2022) - // sgemm = llamafile_sgemm_amd_avx2; - // mixmul = llamafile_mixmul_amd_avx2; - // if (X86_HAVE(F16C)) - // iqk_mixmul = iqk_mul_mat_moe; - // } - // } else { - // // AMD Piledriver (2011-2014) - // sgemm = llamafile_sgemm_amd_fma; - // mixmul = llamafile_mixmul_amd_fma; - // if (X86_HAVE(F16C)) - // iqk_mixmul = iqk_mul_mat_moe; - // } - // } else { - // // Intel Sandybridge/Ivybridge (2010-2012) - // // AMD Bulldozer (2011) - // sgemm = llamafile_sgemm_amd_avx; - // mixmul = llamafile_mixmul_amd_avx; - // } - // } else { - // // AMD K8/Barcelona (2003-2010) - // // Intel Core/Nehalem (2006-2009) - // sgemm = llamafile_sgemm_unsupported; - // mixmul = llamafile_mixmul_unsupported; - // } - -#if defined(__AVX__) -#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) -#if defined(__AVX2__) -#if defined(__AVX512F__) -#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) - // AMD Zen4+ (2023-) - sgemm = llamafile_sgemm_amd_zen4; - mixmul = llamafile_mixmul_amd_zen4; - iqk_mixmul = iqk_mul_mat_moe_zen4; +#if defined(KTRANSFORMERS_USE_NPU) && KTRANSFORMERS_USE_NPU + // 使用 x86 版本 + #include "sgemm_arm.cpp" #else - // Intel Xeon Skylake+ (2015-) - sgemm = llamafile_sgemm_amd_avx512f; - mixmul = llamafile_mixmul_amd_avx512f; - iqk_mixmul = iqk_mul_mat_moe; -#endif -#elif defined(__AVXVNNI__) - // Intel Alderlake (2021-) - sgemm = llamafile_sgemm_amd_avxvnni; - mixmul = llamafile_mixmul_amd_avxvnni; - iqk_mixmul = iqk_mul_mat_moe; -#else - // Intel Haswell/Broadwell/Skylake (2013-2020) - // AMD Excavator (2015-2022) - sgemm = llamafile_sgemm_amd_avx2; - mixmul = llamafile_mixmul_amd_avx2; -#if defined(__F16C__) - iqk_mixmul = iqk_mul_mat_moe; -#endif -#endif -#else - // AMD Piledriver (2011-2014) - sgemm = llamafile_sgemm_amd_fma; - mixmul = llamafile_mixmul_amd_fma; -#if 
defined(__F16C__) - iqk_mixmul = iqk_mul_mat_moe; -#endif -#endif -#else - // Intel Sandybridge/Ivybridge (2010-2012) - // AMD Bulldozer (2011) - sgemm = llamafile_sgemm_amd_avx; - mixmul = llamafile_mixmul_amd_avx; -#endif -#else - // AMD K8/Barcelona (2003-2010) - // Intel Core/Nehalem (2006-2009) - sgemm = llamafile_sgemm_unsupported; - mixmul = llamafile_mixmul_unsupported; -#endif - -#elif defined(__aarch64__) - long hwcap = getauxval(AT_HWCAP); - if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) - (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) - (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) - // e.g. Apple M1, Raspberry Pi 5 - sgemm = llamafile_sgemm_arm82; - mixmul = llamafile_mixmul_arm82; - iqk_mixmul = iqk_mul_mat_moe_arm82; - } else { - // ARM64 baseline ISA - sgemm = llamafile_sgemm_arm80; - mixmul = llamafile_mixmul_arm80; - } -#else - sgemm = llamafile_sgemm_unsupported; - mixmul = llamafile_mixmul_unsupported; -#endif - } -} funcs; - -/** - * Performs optimized matrix multiplication on CPU. - * - * This subroutine may compute C = Aᵀ * B with column major ordering. - * Despite its name, this isn't a generalized implementation. Work is - * only performed when a handwritten kernel is written and available. - * Otherwise the caller should fall back to a general matmul routine. - * - * @param m is rows in `A` and `C` - * @param n is cols in `B` and `C` - * @param k is cols in `A` and rows in `B` - * @param A is first input matrix (always transposed) - * @param lda is row stride of `A` - * @param B is second input matrix (never transposed) - * @param ldb is row stride of `B` - * @param C is input/output array of output matrices - * @param ldc is row stride of `C` - * @param ith is thread id (must be less than `nth`) - * @param nth is number of threads (must be greater than zero) - * @param task is GGML task type - * @param Atype is GGML data type of `A` - * @param Btype is GGML data type of `B` - * @param Ctype is GGML data type of `C` - * @param precision may be used to control the internal compute type - * @return true if this function was able to service the matmul request - */ -bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { - return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype, - precision); -} - -/** - * Performs "mixture of experts" tensor multiplication on CPU. - */ -bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) { - return funcs.mixmul(params, weights, thought, plan, result); -} - -bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) { - return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth); -} + // 使用 ARM 版本 + #include "sgemm_x86.cpp" +#endif \ No newline at end of file diff --git a/third_party/llamafile/sgemm_arm.cpp b/third_party/llamafile/sgemm_arm.cpp new file mode 100644 index 0000000..8bd54f3 --- /dev/null +++ b/third_party/llamafile/sgemm_arm.cpp @@ -0,0 +1,204 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp +// Copyrigth 2024 Mozilla Foundation. 
+// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sgemm.h" +// #include +// #include +// #include +#include +// #include +#include +// #include "llamafile.h" + +static const struct GemmFuncs { + bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int); + bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*); + bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); + // typeof(llamafile_sgemm)* sgemm; + // typeof(llamafile_mixmul)* mixmul; + // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; + GemmFuncs() { +#if defined(__x86_64__) || defined(_M_X64) + // if (X86_HAVE(AVX)) { + // if (X86_HAVE(FMA)) { + // if (X86_HAVE(AVX2)) { + // if (X86_HAVE(AVX512F)) { + // if (X86_HAVE(AVX512VL) && // + // X86_HAVE(AVX512BW) && // + // X86_HAVE(AVX512DQ) && // + // X86_HAVE(AVX512_VNNI) && // + // X86_HAVE(AVX512_BF16)) { + // // AMD Zen4+ (2023-) + // sgemm = llamafile_sgemm_amd_zen4; + // mixmul = llamafile_mixmul_amd_zen4; + // iqk_mixmul = iqk_mul_mat_moe_zen4; + // } else { + // // Intel Xeon Skylake+ (2015-) + // sgemm = llamafile_sgemm_amd_avx512f; + // mixmul = llamafile_mixmul_amd_avx512f; + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else if (X86_HAVE(AVXVNNI)) { + // // Intel Alderlake (2021-) + // sgemm = llamafile_sgemm_amd_avxvnni; + // mixmul = llamafile_mixmul_amd_avxvnni; + // iqk_mixmul = iqk_mul_mat_moe; + // } else { + // // Intel Haswell/Broadwell/Skylake (2013-2020) + // // AMD Excavator (2015-2022) + // sgemm = llamafile_sgemm_amd_avx2; + // mixmul = llamafile_mixmul_amd_avx2; + // if (X86_HAVE(F16C)) + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else { + // // AMD Piledriver (2011-2014) + // sgemm = llamafile_sgemm_amd_fma; + // mixmul = llamafile_mixmul_amd_fma; + // if (X86_HAVE(F16C)) + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else { + // // Intel Sandybridge/Ivybridge (2010-2012) + // // AMD Bulldozer (2011) + // sgemm = llamafile_sgemm_amd_avx; + // mixmul = llamafile_mixmul_amd_avx; + // } + // } else { + // // AMD K8/Barcelona (2003-2010) + // // Intel Core/Nehalem (2006-2009) + // sgemm = llamafile_sgemm_unsupported; + // mixmul = llamafile_mixmul_unsupported; + // } + +#if defined(__AVX__) +#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) +#if defined(__AVX2__) +#if defined(__AVX512F__) +#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) + // AMD Zen4+ (2023-) + sgemm = llamafile_sgemm_amd_zen4; + mixmul = llamafile_mixmul_amd_zen4; + iqk_mixmul = 
iqk_mul_mat_moe_zen4; +#else + // Intel Xeon Skylake+ (2015-) + sgemm = llamafile_sgemm_amd_avx512f; + mixmul = llamafile_mixmul_amd_avx512f; + iqk_mixmul = iqk_mul_mat_moe; +#endif +#elif defined(__AVXVNNI__) + // Intel Alderlake (2021-) + sgemm = llamafile_sgemm_amd_avxvnni; + mixmul = llamafile_mixmul_amd_avxvnni; + iqk_mixmul = iqk_mul_mat_moe; +#else + // Intel Haswell/Broadwell/Skylake (2013-2020) + // AMD Excavator (2015-2022) + sgemm = llamafile_sgemm_amd_avx2; + mixmul = llamafile_mixmul_amd_avx2; +#if defined(__F16C__) + iqk_mixmul = iqk_mul_mat_moe; +#endif +#endif +#else + // AMD Piledriver (2011-2014) + sgemm = llamafile_sgemm_amd_fma; + mixmul = llamafile_mixmul_amd_fma; +#if defined(__F16C__) + iqk_mixmul = iqk_mul_mat_moe; +#endif +#endif +#else + // Intel Sandybridge/Ivybridge (2010-2012) + // AMD Bulldozer (2011) + sgemm = llamafile_sgemm_amd_avx; + mixmul = llamafile_mixmul_amd_avx; +#endif +#else + // AMD K8/Barcelona (2003-2010) + // Intel Core/Nehalem (2006-2009) + sgemm = llamafile_sgemm_unsupported; + mixmul = llamafile_mixmul_unsupported; +#endif + +#elif defined(__aarch64__) +// long hwcap = getauxval(AT_HWCAP); +// if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) +// (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) +// (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) +// // e.g. Apple M1, Raspberry Pi 5 +// sgemm = llamafile_sgemm_arm82; +// mixmul = llamafile_mixmul_arm82; +// iqk_mixmul = iqk_mul_mat_moe_arm82; +// } else { + // ARM64 baseline ISA + sgemm = llamafile_sgemm_arm80; + mixmul = llamafile_mixmul_arm80; +// } +#else + sgemm = llamafile_sgemm_unsupported; + mixmul = llamafile_mixmul_unsupported; +#endif + } +} funcs; + +/** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @param task is GGML task type + * @param Atype is GGML data type of `A` + * @param Btype is GGML data type of `B` + * @param Ctype is GGML data type of `C` + * @param precision may be used to control the internal compute type + * @return true if this function was able to service the matmul request + */ +bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype, + precision); +} + +/** + * Performs "mixture of experts" tensor multiplication on CPU. 
+ */ +bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) { + return funcs.mixmul(params, weights, thought, plan, result); +} + +bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) { + return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth); +} diff --git a/third_party/llamafile/sgemm_x86.cpp b/third_party/llamafile/sgemm_x86.cpp new file mode 100644 index 0000000..38f6d18 --- /dev/null +++ b/third_party/llamafile/sgemm_x86.cpp @@ -0,0 +1,204 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp +// Copyrigth 2024 Mozilla Foundation. +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. + +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sgemm.h" +// #include +// #include +// #include +#include +// #include +#include +// #include "llamafile.h" + +static const struct GemmFuncs { + bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int); + bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*); + bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); + // typeof(llamafile_sgemm)* sgemm; + // typeof(llamafile_mixmul)* mixmul; + // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; + GemmFuncs() { +#if defined(__x86_64__) || defined(_M_X64) + // if (X86_HAVE(AVX)) { + // if (X86_HAVE(FMA)) { + // if (X86_HAVE(AVX2)) { + // if (X86_HAVE(AVX512F)) { + // if (X86_HAVE(AVX512VL) && // + // X86_HAVE(AVX512BW) && // + // X86_HAVE(AVX512DQ) && // + // X86_HAVE(AVX512_VNNI) && // + // X86_HAVE(AVX512_BF16)) { + // // AMD Zen4+ (2023-) + // sgemm = llamafile_sgemm_amd_zen4; + // mixmul = llamafile_mixmul_amd_zen4; + // iqk_mixmul = iqk_mul_mat_moe_zen4; + // } else { + // // Intel Xeon Skylake+ (2015-) + // sgemm = llamafile_sgemm_amd_avx512f; + // mixmul = llamafile_mixmul_amd_avx512f; + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else if (X86_HAVE(AVXVNNI)) { + // // Intel Alderlake (2021-) + // sgemm = llamafile_sgemm_amd_avxvnni; + // mixmul = llamafile_mixmul_amd_avxvnni; + // iqk_mixmul = iqk_mul_mat_moe; + // } else { + // // Intel Haswell/Broadwell/Skylake (2013-2020) + // // AMD Excavator (2015-2022) + // sgemm = llamafile_sgemm_amd_avx2; + // mixmul = llamafile_mixmul_amd_avx2; + // if (X86_HAVE(F16C)) + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else { + // // AMD Piledriver (2011-2014) + // sgemm = llamafile_sgemm_amd_fma; + // mixmul = 
llamafile_mixmul_amd_fma; + // if (X86_HAVE(F16C)) + // iqk_mixmul = iqk_mul_mat_moe; + // } + // } else { + // // Intel Sandybridge/Ivybridge (2010-2012) + // // AMD Bulldozer (2011) + // sgemm = llamafile_sgemm_amd_avx; + // mixmul = llamafile_mixmul_amd_avx; + // } + // } else { + // // AMD K8/Barcelona (2003-2010) + // // Intel Core/Nehalem (2006-2009) + // sgemm = llamafile_sgemm_unsupported; + // mixmul = llamafile_mixmul_unsupported; + // } + +#if defined(__AVX__) +#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) +#if defined(__AVX2__) +#if defined(__AVX512F__) +#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) + // AMD Zen4+ (2023-) + sgemm = llamafile_sgemm_amd_zen4; + mixmul = llamafile_mixmul_amd_zen4; + iqk_mixmul = iqk_mul_mat_moe_zen4; +#else + // Intel Xeon Skylake+ (2015-) + sgemm = llamafile_sgemm_amd_avx512f; + mixmul = llamafile_mixmul_amd_avx512f; + iqk_mixmul = iqk_mul_mat_moe; +#endif +#elif defined(__AVXVNNI__) + // Intel Alderlake (2021-) + sgemm = llamafile_sgemm_amd_avxvnni; + mixmul = llamafile_mixmul_amd_avxvnni; + iqk_mixmul = iqk_mul_mat_moe; +#else + // Intel Haswell/Broadwell/Skylake (2013-2020) + // AMD Excavator (2015-2022) + sgemm = llamafile_sgemm_amd_avx2; + mixmul = llamafile_mixmul_amd_avx2; +#if defined(__F16C__) + iqk_mixmul = iqk_mul_mat_moe; +#endif +#endif +#else + // AMD Piledriver (2011-2014) + sgemm = llamafile_sgemm_amd_fma; + mixmul = llamafile_mixmul_amd_fma; +#if defined(__F16C__) + iqk_mixmul = iqk_mul_mat_moe; +#endif +#endif +#else + // Intel Sandybridge/Ivybridge (2010-2012) + // AMD Bulldozer (2011) + sgemm = llamafile_sgemm_amd_avx; + mixmul = llamafile_mixmul_amd_avx; +#endif +#else + // AMD K8/Barcelona (2003-2010) + // Intel Core/Nehalem (2006-2009) + sgemm = llamafile_sgemm_unsupported; + mixmul = llamafile_mixmul_unsupported; +#endif + +#elif defined(__aarch64__) + long hwcap = getauxval(AT_HWCAP); + if ((hwcap & HWCAP_FPHP) && // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1) + (hwcap & HWCAP_ASIMDHP) && // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1) + (hwcap & HWCAP_ASIMDDP)) { // dotprod isa (ID_AA64ISAR0_EL1.DP == 1) + // e.g. Apple M1, Raspberry Pi 5 + sgemm = llamafile_sgemm_arm82; + mixmul = llamafile_mixmul_arm82; + iqk_mixmul = iqk_mul_mat_moe_arm82; + } else { + // ARM64 baseline ISA + sgemm = llamafile_sgemm_arm80; + mixmul = llamafile_mixmul_arm80; + } +#else + sgemm = llamafile_sgemm_unsupported; + mixmul = llamafile_mixmul_unsupported; +#endif + } +} funcs; + +/** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. 
+ * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @param task is GGML task type + * @param Atype is GGML data type of `A` + * @param Btype is GGML data type of `B` + * @param Ctype is GGML data type of `C` + * @param precision may be used to control the internal compute type + * @return true if this function was able to service the matmul request + */ +bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype, + precision); +} + +/** + * Performs "mixture of experts" tensor multiplication on CPU. + */ +bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) { + return funcs.mixmul(params, weights, thought, plan, result); +} + +bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) { + return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth); +} diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp index ec36775..cb79878 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp @@ -5,6 +5,7 @@ #ifdef __aarch64__ #define llamafile_mixmul llamafile_mixmul_arm80 +#define iqk_mul_mat iqk_mul_mat_arm80 #include "tinyblas_cpu_mixmul.inc" /** diff --git a/third_party/llamafile/tinyblas_cpu_sgemm.inc b/third_party/llamafile/tinyblas_cpu_sgemm.inc index 9ed8f35..f3b099e 100644 --- a/third_party/llamafile/tinyblas_cpu_sgemm.inc +++ b/third_party/llamafile/tinyblas_cpu_sgemm.inc @@ -1,361 +1,7 @@ -// Adapted from -// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc -// Copyrigth 2024 Mozilla Foundation. -// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. - -// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- -// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi -// -// Copyright 2024 Mozilla Foundation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include "tinyblas_cpu.h"
-
-//
-//
-// ██████╗ ██╗ █████╗ ██████╗
-// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝
-// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗
-// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║
-// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║
-// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝
-//
-// BASIC LINEAR ALGEBRA SUBPROGRAMS
-//
-//
-// This file implements multithreaded CPU matrix multiplication for the
-// common contiguous use case C = Aᵀ * B. These kernels are designed to
-// have excellent performance[1] for matrices that fit in the CPU cache
-// without imposing any overhead such as cache filling or malloc calls.
-//
-// This implementation does not guarantee any upper bound with rounding
-// errors, which grow along with k. Our goal's to maximally exploit the
-// hardware for performance, and then use whatever resources remain for
-// improving numerical accuracy.
-//
-// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
-//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
-
-namespace {
-
-template
-bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
-    switch (Atype) {
-    case GGML_TYPE_F32: {
-        if (Btype != GGML_TYPE_F32)
-            return NOT_SUPPORTED;
-#if defined(__AVX512F__)
-        if (k % 16)
-            return NOT_SUPPORTED;
-        tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{
-            k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
-        tb.matmul(m, n, task);
-        return true;
-#elif defined(__AVX__) || defined(__AVX2__)
-        if (k % 8)
-            return NOT_SUPPORTED;
-        tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{
-            k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
-        tb.matmul(m, n, task);
-        return true;
-#elif defined(__ARM_NEON)
-        if (k % 4)
-            return NOT_SUPPORTED;
-        tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{
-            k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
-        tb.matmul(m, n, task);
-        return true;
+#if defined(KTRANSFORMERS_USE_NPU) && KTRANSFORMERS_USE_NPU
+    // use the ARM version
+    #include "tinyblas_cpu_sgemm_arm.inc"
 #else
-        return NOT_SUPPORTED;
-#endif
-    }
-
-    case GGML_TYPE_BF16: {
-#if defined(__AVX512BF16__)
-        if (k % 32)
-            return NOT_SUPPORTED;
-        if (Btype == GGML_TYPE_F32 && n < 2) {
-            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
-                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
-            tb.matmul(m, n, task);
-            return true;
-        }
-        if (Btype == GGML_TYPE_F32)
-            return WANT_QUANTIZATION;
-        if (Btype != GGML_TYPE_BF16)
-            return NOT_SUPPORTED;
-        if (!FLAG_precise) {
-            tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{
-                k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
-            tb.matmul(m, n, task);
-            return true;
-        } else {
-            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{
-                k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
-            tb.matmul(m, n, task);
-            return true;
-        }
-#elif defined(__AVX512F__)
-        if (k % 16)
-            return NOT_SUPPORTED;
-        tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
-            k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
-        tb.matmul(m, n, task);
-        return true;
-#elif defined(__AVX2__)
-        if (k % 8)
-            return NOT_SUPPORTED;
-        if (Btype != GGML_TYPE_F32)
-            return NOT_SUPPORTED;
-        tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{
-            k, (const 
ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - if (k % 4) - return NOT_SUPPORTED; - if (Btype != GGML_TYPE_F32) - return NOT_SUPPORTED; - tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{ - k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#else - return NOT_SUPPORTED; -#endif - } - - case GGML_TYPE_F16: { -#if defined(__AVX512F__) - if (k % 16) - return NOT_SUPPORTED; - if (Btype == GGML_TYPE_F32 && n < 2) { - tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; - } - if (Btype == GGML_TYPE_F32) - return WANT_QUANTIZATION; - if (Btype != GGML_TYPE_F16) - return NOT_SUPPORTED; - tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) - // if (X86_CHECK(F16C)) { - if (k % 8) - return NOT_SUPPORTED; - if (Btype == GGML_TYPE_F32 && n < 2) { - tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; - } - if (Btype == GGML_TYPE_F32) - return WANT_QUANTIZATION; - if (Btype != GGML_TYPE_F16) - return NOT_SUPPORTED; - tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; - // } else { - // return NOT_SUPPORTED; - // } -#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) - if (n < 2 && !FLAG_precise) - // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec? - return NOT_SUPPORTED; - if (precision == GGML_PREC_F32) { - if (k % 4) - return NOT_SUPPORTED; - if (Btype != GGML_TYPE_F32) - return NOT_SUPPORTED; - tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; - } else { - if (k % 8) - return NOT_SUPPORTED; - if (Btype == GGML_TYPE_F32) - return WANT_QUANTIZATION; - if (Btype != GGML_TYPE_F16) - return NOT_SUPPORTED; - tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; - } -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - if (n < 2 && !FLAG_precise) - // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec? 
- return NOT_SUPPORTED; - if (k % 4) - return NOT_SUPPORTED; - if (Btype != GGML_TYPE_F32) - return NOT_SUPPORTED; - tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{ - k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#else - return NOT_SUPPORTED; -#endif - } - - case GGML_TYPE_Q8_0: { - if (Btype == GGML_TYPE_F32) - return WANT_QUANTIZATION; - if (Btype != GGML_TYPE_Q8_0) - return NOT_SUPPORTED; -#if defined(__AVX2__) || defined(__AVX512F__) - tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{ - k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{ - k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#else - return NOT_SUPPORTED; -#endif - } - - case GGML_TYPE_Q4_0: { - if (Btype == GGML_TYPE_F32) - return WANT_QUANTIZATION; - if (Btype != GGML_TYPE_Q8_0) - return NOT_SUPPORTED; -#if defined(__AVX2__) || defined(__AVX512F__) - tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{ - k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{ - k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; - tb.matmul(m, n, task); - return true; -#else - return NOT_SUPPORTED; -#endif - } - - default: - return NOT_SUPPORTED; - } - - (void)m; - (void)n; - (void)k; - (void)A; - (void)lda; - (void)B; - (void)ldb; - (void)C; - (void)ldc; - (void)ith; - (void)nth; - (void)Atype; - (void)Btype; - (void)precision; -} - -} // namespace - -/** - * Performs optimized matrix multiplication on CPU. - * - * This subroutine may compute C = Aᵀ * B with column major ordering. - * Despite its name, this isn't a generalized implementation. Work is - * only performed when a handwritten kernel is written and available. - * Otherwise the caller should fall back to a general matmul routine. 
- *
- * For example, for single-threaded single-precision GEMM you can say
- *
- *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1,
- *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
- *                     GGML_PREC_DEFAULT);
- *
- * @param m is rows in `A` and `C`
- * @param n is cols in `B` and `C`
- * @param k is cols in `A` and rows in `B`
- * @param A is first input matrix (always transposed)
- * @param lda is row stride of `A`
- * @param B is second input matrix (never transposed)
- * @param ldb is row stride of `B`
- * @param C is input/output array of output matrices
- * @param ldc is row stride of `C`
- * @param ith is thread id (must be less than `nth`)
- * @param nth is number of threads (must be greater than zero)
- * @param Atype is GGML data type of `A`
- * @param Btype is GGML data type of `B`
- * @param Ctype is GGML data type of `C`
- * @param precision may be used to control the internal compute type
- * @return true if this function was able to service the matmul request
- */
-bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
-    assert(m >= 0);
-    assert(n >= 0);
-    assert(k >= 0);
-    assert(lda >= k);
-    assert(ldb >= k);
-    assert(ldc >= m);
-    assert(nth > 0);
-    assert(ith < nth);
-
-#if QK_K == 256
-#if defined(__x86_64__) || defined(_M_X64)
-#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
-    /*
-    moonll
-    more Btype accept
-    }*/
-
-    if (Ctype == GGML_TYPE_F32){
-        if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A,lda,Btype, B,ldb, (float*)C, ldc, ith, nth)) {
-            return true;
-        }
-    }
-
-#endif
-#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
-    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
-        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
-            return true;
-        }
-    }
-    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
-        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
-        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
-        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
-            return true;
-        }
-    }
-#endif
-#endif
-
-    switch (Ctype) {
-    case GGML_TYPE_F32:
-        return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype,
-                                    Btype, Ctype, precision);
-    default:
-        return NOT_SUPPORTED;
-    }
-}
+    // use the x86 version
+    #include "tinyblas_cpu_sgemm_x86.inc"
+#endif
\ No newline at end of file
diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_arm.inc b/third_party/llamafile/tinyblas_cpu_sgemm_arm.inc
new file mode 100644
index 0000000..697e208
--- /dev/null
+++ b/third_party/llamafile/tinyblas_cpu_sgemm_arm.inc
@@ -0,0 +1,471 @@
+// Adapted from
+// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
+// Copyrigth 2024 Mozilla Foundation.
+// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
+
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tinyblas_cpu.h" +#include +#include +#include +// +// +// ██████╗ ██╗ █████╗ ██████╗ +// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝ +// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗ +// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║ +// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║ +// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. + +namespace { + +template +void SgemmHelperN1Neon2(long m, long n, long k, const float16_t* A, long lda, const float16_t* B, long ldb, + TC* C, long ldc, int ith, int nth) { + // A m * k B n * k c n * m + const long NVL = 8; + long kk = k / (NVL * 4); + kk = kk * (NVL * 4); + long length = (m / nth) + (ith < (m % nth) ? 1 : 0); + long startRow = ith * (m / nth) + (ith < (m % nth) ? 
ith : (m % nth)); + long endRow = startRow + length; + for (long i = startRow; i < endRow; i ++) { + const float16_t* tA = A + i * lda; + float32x4_t c0 = vdupq_n_f32(0); + float32x4_t c1 = vdupq_n_f32(0); + float32x4_t c2 = vdupq_n_f32(0); + float32x4_t c3 = vdupq_n_f32(0); + float32x4_t c4 = vdupq_n_f32(0); + float32x4_t c5 = vdupq_n_f32(0); + float32x4_t c6 = vdupq_n_f32(0); + float32x4_t c7 = vdupq_n_f32(0); + for (long j = 0; j < kk; j += NVL * 4) { + __builtin_prefetch(tA + 192, 0, 0); + float16x8_t a0 = vld1q_f16(tA + j); + float16x8_t b0 = vld1q_f16(B + j); + c0 = vfmlalq_low_f16(c0, a0, b0); + c1 = vfmlalq_high_f16(c1, a0, b0); + float16x8_t a1 = vld1q_f16(tA + j + NVL); + float16x8_t b1 = vld1q_f16(B + j + NVL); + c2 = vfmlalq_low_f16(c2, a1, b1); + c3 = vfmlalq_high_f16(c3, a1, b1); + float16x8_t a2 = vld1q_f16(tA + j + NVL * 2); + float16x8_t b2 = vld1q_f16(B + j + NVL * 2); + c4 = vfmlalq_low_f16(c4, a2, b2); + c5 = vfmlalq_high_f16(c5, a2, b2); + float16x8_t a3 = vld1q_f16(tA + j + NVL * 3); + float16x8_t b3 = vld1q_f16(B + j + NVL * 3); + c6 = vfmlalq_low_f16(c6, a3, b3); + c7 = vfmlalq_high_f16(c7, a3, b3); + } + if (k - kk >= NVL * 2) { + float16x8_t a0 = vld1q_f16(tA + kk); + float16x8_t b0 = vld1q_f16(B + kk); + c0 = vfmlalq_low_f16(c0, a0, b0); + c1 = vfmlalq_high_f16(c1, a0, b0); + float16x8_t a1 = vld1q_f16(tA + kk + NVL); + float16x8_t b1 = vld1q_f16(B + kk + NVL); + c2 = vfmlalq_low_f16(c2, a1, b1); + c3 = vfmlalq_high_f16(c3, a1, b1); + kk += NVL * 2; + } + if (k - kk >= NVL) { + float16x8_t a = vld1q_f16(tA + kk); + float16x8_t b = vld1q_f16(B + kk); + c0 = vfmlalq_low_f16(c0, a, b); + c1 = vfmlalq_high_f16(c1, a, b); + kk += NVL; + } + TC sum = 0.0f; + for (long j = kk; j < k; j ++) { + sum += (float32_t)tA[j] * (float32_t)B[j]; + } + c0 = vaddq_f32(c0, c1); + c2 = vaddq_f32(c2, c3); + c4 = vaddq_f32(c4, c5); + c6 = vaddq_f32(c6, c7); + c0 = vaddq_f32(c0, c2); + c4 = vaddq_f32(c4, c6); + sum += vaddvq_f32(c0) + vaddvq_f32(c4); + C[i] = sum; + } + return; +} + +template +void SgemmHelperN1(long m, long n, long k, const ggml_fp16_t* A_, long lda, const ggml_fp16_t* B_, long ldb, + TC* C, long ldc, int ith, int nth) { + // A m * k B n * k c n * m + float16_t *A = (float16_t*)A_; + float16_t *B = (float16_t*)B_; + long rowsPerThread = m / nth; + long startRow = ith * rowsPerThread; + long endRow = (ith == nth - 1) ? 
m : startRow + rowsPerThread; + for (long i = startRow; i < endRow; i ++) { + TC sum = 0.0f; + for (long j = 0; j < k; j ++) { + sum += (float32_t)A[i * lda + j] * (float32_t)B[j]; + } + C[i] = sum; + } + return; +} + +template +bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + switch (Atype) { + case GGML_TYPE_F32: { + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; +#if defined(__AVX512F__) + if (k % 16) + return NOT_SUPPORTED; + tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{ + k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__AVX__) || defined(__AVX2__) + if (k % 8) + return NOT_SUPPORTED; + tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{ + k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_NEON) + if (k % 4) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{ + k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_BF16: { +#if defined(__AVX512BF16__) + if (k % 32) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32 && n < 2) { + tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_BF16) + return NOT_SUPPORTED; + if (!FLAG_precise) { + tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } else { + tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } +#elif defined(__AVX512F__) + if (k % 16) + return NOT_SUPPORTED; + tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__AVX2__) + if (k % 8) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + if (k % 4) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_F16: { +#if defined(__AVX512F__) + if (k % 16) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32 && n < 2) { + tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_F16) + return NOT_SUPPORTED; + tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const 
ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + // if (X86_CHECK(F16C)) { + if (k % 8) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32 && n < 2) { + tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_F16) + return NOT_SUPPORTED; + tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + // } else { + // return NOT_SUPPORTED; + // } +#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) + if (n < 2 && !FLAG_precise) { + // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec? + if (Btype == GGML_TYPE_F16 && task == GGML_TASK_TYPE_COMPUTE) { + SgemmHelperN1Neon2(m, n, k, (const float16_t*)A, lda, (const float16_t*)B, ldb, C, ldc, ith, nth); + return true; + } + return NOT_SUPPORTED; + } + if (precision == GGML_PREC_F32) { + if (k % 4) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } else { + if (k % 8) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_F16) + return NOT_SUPPORTED; + tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + if (n < 2 && !FLAG_precise) { + // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec? 
+ if (Btype == GGML_TYPE_F16 && task == GGML_TASK_TYPE_COMPUTE) { + SgemmHelperN1Neon2(m, n, k, (const float16_t*)A, lda, (const float16_t*)B, ldb, C, ldc, ith, nth); + return true; + } + return NOT_SUPPORTED; + } + if (k % 4) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_Q8_0: { + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_Q8_0) + return NOT_SUPPORTED; +#if defined(__AVX2__) || defined(__AVX512F__) + tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{ + k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_DOTPROD) + tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{ + k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_Q4_0: { + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_Q8_0) + return NOT_SUPPORTED; +#if defined(__AVX2__) || defined(__AVX512F__) + tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{ + k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_DOTPROD) + tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{ + k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + default: + return NOT_SUPPORTED; + } + + (void)m; + (void)n; + (void)k; + (void)A; + (void)lda; + (void)B; + (void)ldb; + (void)C; + (void)ldc; + (void)ith; + (void)nth; + (void)Atype; + (void)Btype; + (void)precision; +} + +} // namespace + +/** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. 
+ * + * For example, for single-threaded single-precision GEMM you can say + * + * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, + * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, + * GGML_PREC_DEFAULT); + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @param Atype is GGML data type of `A` + * @param Btype is GGML data type of `B` + * @param Ctype is GGML data type of `C` + * @param precision may be used to control the internal compute type + * @return true if this function was able to service the matmul request + */ +bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); + +#if QK_K == 256 +#if defined(__x86_64__) || defined(_M_X64) +#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))) + /* + moonll + more Btype accept + }*/ + // if (X86_CHECK(AVX2) && X86_CHECK(FMA)) { + if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32){ + if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) { + return true; + } + } + if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) { + assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32)); + if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) { + return true; + } + } + +#endif +#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER + if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { + if (iqk_mul_mat(m, n, k * QK_K, Atype, A, k, Btype, B, k, (float*)C, ldc, ith, nth)) { + return true; + } + } + if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) { + // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32); + assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32)); + if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, k, Btype, B, k, (float*)C, ldc, ith, nth)) { + return true; + } + } +#endif +#endif + + switch (Ctype) { + case GGML_TYPE_F32: + return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype, + Btype, Ctype, precision); + default: + return NOT_SUPPORTED; + } +} \ No newline at end of file diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_x86.inc b/third_party/llamafile/tinyblas_cpu_sgemm_x86.inc new file mode 100644 index 0000000..9ed8f35 --- /dev/null +++ b/third_party/llamafile/tinyblas_cpu_sgemm_x86.inc @@ -0,0 +1,361 @@ +// Adapted from +// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc +// Copyrigth 2024 Mozilla Foundation. +// Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
+ +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// Copyright 2024 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tinyblas_cpu.h" + +// +// +// ██████╗ ██╗ █████╗ ██████╗ +// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝ +// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗ +// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║ +// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║ +// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. 
+ +namespace { + +template +bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + switch (Atype) { + case GGML_TYPE_F32: { + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; +#if defined(__AVX512F__) + if (k % 16) + return NOT_SUPPORTED; + tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{ + k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__AVX__) || defined(__AVX2__) + if (k % 8) + return NOT_SUPPORTED; + tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{ + k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_NEON) + if (k % 4) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{ + k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_BF16: { +#if defined(__AVX512BF16__) + if (k % 32) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32 && n < 2) { + tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_BF16) + return NOT_SUPPORTED; + if (!FLAG_precise) { + tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } else { + tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } +#elif defined(__AVX512F__) + if (k % 16) + return NOT_SUPPORTED; + tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__AVX2__) + if (k % 8) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + if (k % 4) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{ + k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_F16: { +#if defined(__AVX512F__) + if (k % 16) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32 && n < 2) { + tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_F16) + return NOT_SUPPORTED; + tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + // if (X86_CHECK(F16C)) { + if (k % 8) + return 
NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32 && n < 2) { + tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_F16) + return NOT_SUPPORTED; + tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + // } else { + // return NOT_SUPPORTED; + // } +#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) + if (n < 2 && !FLAG_precise) + // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec? + return NOT_SUPPORTED; + if (precision == GGML_PREC_F32) { + if (k % 4) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } else { + if (k % 8) + return NOT_SUPPORTED; + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_F16) + return NOT_SUPPORTED; + tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; + } +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + if (n < 2 && !FLAG_precise) + // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec? + return NOT_SUPPORTED; + if (k % 4) + return NOT_SUPPORTED; + if (Btype != GGML_TYPE_F32) + return NOT_SUPPORTED; + tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{ + k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_Q8_0: { + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_Q8_0) + return NOT_SUPPORTED; +#if defined(__AVX2__) || defined(__AVX512F__) + tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{ + k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_DOTPROD) + tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{ + k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + case GGML_TYPE_Q4_0: { + if (Btype == GGML_TYPE_F32) + return WANT_QUANTIZATION; + if (Btype != GGML_TYPE_Q8_0) + return NOT_SUPPORTED; +#if defined(__AVX2__) || defined(__AVX512F__) + tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{ + k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_DOTPROD) + tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{ + k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return NOT_SUPPORTED; +#endif + } + + default: + return NOT_SUPPORTED; + } + + (void)m; + (void)n; + (void)k; + (void)A; + (void)lda; + (void)B; + (void)ldb; + (void)C; + (void)ldc; + (void)ith; + (void)nth; + (void)Atype; + (void)Btype; + (void)precision; +} + +} // namespace + +/** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. 
+ * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. + * + * For example, for single-threaded single-precision GEMM you can say + * + * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, + * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, + * GGML_PREC_DEFAULT); + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @param Atype is GGML data type of `A` + * @param Btype is GGML data type of `B` + * @param Ctype is GGML data type of `C` + * @param precision may be used to control the internal compute type + * @return true if this function was able to service the matmul request + */ +bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); + +#if QK_K == 256 +#if defined(__x86_64__) || defined(_M_X64) +#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))) + /* + moonll + more Btype accept + }*/ + + if (Ctype == GGML_TYPE_F32){ + if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A,lda,Btype, B,ldb, (float*)C, ldc, ith, nth)) { + return true; + } + } + +#endif +#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER + if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { + if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) { + return true; + } + } + if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) { + // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32); + assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32)); + if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) { + return true; + } + } +#endif +#endif + + switch (Ctype) { + case GGML_TYPE_F32: + return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype, + Btype, Ctype, precision); + default: + return NOT_SUPPORTED; + } +}
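
Reviewer note, not part of the patch: both new translation units keep the same llamafile_sgemm entry point, so call sites are unchanged whichever of sgemm_arm.cpp / sgemm_x86.cpp the KTRANSFORMERS_USE_NPU switch pulls in. A minimal single-threaded FP32 call, sketched against the signature and asserts shown in the hunks above (the wrapper name is hypothetical):

    #include "sgemm.h"

    // Hypothetical helper, not part of this patch: computes C = Aᵀ * B in
    // column-major order, i.e. A is m x k (lda >= k), B is n x k (ldb >= k),
    // and C stores n columns of length m (ldc >= m).
    static bool sgemm_f32_single_thread(long m, long n, long k,
                                        const float* A, const float* B, float* C) {
        return llamafile_sgemm(m, n, k,
                               A, /*lda=*/k,
                               B, /*ldb=*/k,
                               C, /*ldc=*/m,
                               /*ith=*/0, /*nth=*/1,
                               GGML_TASK_TYPE_COMPUTE,
                               GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
                               GGML_PREC_DEFAULT);
    }

On the ARM/NEON F32 path this additionally requires k to be a multiple of 4; when any guard rejects the request the function returns false and the caller is expected to fall back to a generic matmul routine.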