Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00

Commit ec95241e38: "temp checkpoint"
23 changed files with 460 additions and 105 deletions
@@ -129,7 +129,11 @@ static void common_params_handle_model_default(common_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = params.hf_repo + "_" + params.hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            params.model = fs_get_cache_file(filename);
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
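The new cache naming joins the Hugging Face repo and file path and turns every slash into an underscore, so two repos that ship a file with the same name (or the same file name in different subdirectories) no longer collide in the cache. A minimal stand-alone sketch of that flattening; replace_all() and the cache directory below are stand-ins for common's string_replace_all() and fs_get_cache_file(), and the repo/file names are just examples:

```cpp
#include <iostream>
#include <string>

// Stand-in for string_replace_all(): replace every occurrence of `from` in place.
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    const std::string hf_repo = "ggml-org/models";                               // example repo
    const std::string hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf";  // example file

    std::string filename = hf_repo + "_" + hf_file;
    replace_all(filename, "/", "_");   // no slashes may remain in a cache file name

    // fs_get_cache_file() would prepend the llama.cpp cache directory;
    // "~/.cache/llama.cpp/" here is only illustrative.
    std::cout << "~/.cache/llama.cpp/" << filename << "\n";
    // -> ~/.cache/llama.cpp/ggml-org_models_jina-reranker-v1-tiny-en_ggml-model-f16.gguf
}
```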
@@ -1367,8 +1371,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
    ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2101,8 +2106,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.speculative.n_gpu_layers = value;
            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
@@ -831,9 +831,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;

     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -1344,17 +1344,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
 }

 struct llama_model * common_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }

-    if (!common_download_file(model_url, path_model, hf_token)) {
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }

@@ -1365,9 +1365,9 @@ struct llama_model * common_load_model_from_url(
             /*.no_alloc = */ true,
             /*.ctx      = */ NULL,
         };
-        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
         if (!ctx_gguf) {
-            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
             return NULL;
         }

@@ -1386,13 +1386,13 @@ struct llama_model * common_load_model_from_url(
         // Verify the first split file format
         // and extract split URL and PATH prefixes
         {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
                 return NULL;
             }

-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
                 return NULL;
             }
         }

@@ -1419,14 +1419,14 @@ struct llama_model * common_load_model_from_url(
         }
     }

-    return llama_load_model_from_file(path_model, params);
+    return llama_load_model_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
-        const char * repo,
-        const char * model,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1440,27 +1440,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url += model;
+    model_url += remote_path;

-    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }

 #else

 struct llama_model * common_load_model_from_url(
-        const char * /*model_url*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }

 struct llama_model * common_load_model_from_hf(
-        const char * /*repo*/,
-        const char * /*model*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
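For multi-part GGUF downloads, the loader derives a common prefix from the first shard's file name and then rebuilds the names of the remaining shards from that prefix. A small stand-alone sketch of that naming logic, assuming the usual "<prefix>-00001-of-0000N.gguf" shard convention; the file name is hypothetical and the string handling here stands in for llama_split_prefix():

```cpp
#include <cstdio>
#include <string>

int main() {
    const std::string local_path = "qwen2.5-32b-q4_0-00001-of-00004.gguf"; // hypothetical first shard
    const int n_split = 4;

    // strip the "-00001-of-00004.gguf" suffix to recover the common prefix
    char suffix[64];
    std::snprintf(suffix, sizeof(suffix), "-%05d-of-%05d.gguf", 1, n_split);
    const std::string prefix = local_path.substr(0, local_path.size() - std::string(suffix).size());

    for (int i = 1; i <= n_split; ++i) {
        char path[256];
        std::snprintf(path, sizeof(path), "%s-%05d-of-%05d.gguf", prefix.c_str(), i, n_split);
        std::printf("%s\n", path); // each shard is fetched (or found in the cache) separately
    }
}
```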
@@ -466,8 +466,17 @@ struct llama_model_params common_model_params_to_llama(common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
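With the header now taking std::string references, callers no longer have to thread c_str() through every argument. A hedged usage sketch of the new declarations, assuming a normal llama.cpp build with these common headers available; the repo/file/path values are placeholders, and an empty token is fine for public repos:

```cpp
#include <string>
#include "common.h"   // declares common_load_model_from_hf / common_load_model_from_url
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    llama_model * model = common_load_model_from_hf(
        /*repo        =*/ "ggml-org/models",                                  // placeholder repo
        /*remote_path =*/ "jina-reranker-v1-tiny-en/ggml-model-f16.gguf",     // placeholder file in the repo
        /*local_path  =*/ "models/ggml-model-f16.gguf",                       // where to store the download
        /*hf_token    =*/ "",
        mparams);

    if (model == nullptr) {
        return 1;   // download or load failed
    }
    llama_free_model(model);
    return 0;
}
```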
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
-openai~=1.30.3
+openai~=1.55.3
 prometheus-client~=0.20.0
 requests~=2.32.3
@@ -8,7 +8,6 @@ import os
 import re
 import json
 import sys
-import threading
 import requests
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -161,26 +160,12 @@ class ServerProcess:
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"},
         )
         server_instances.add(self)

-        def server_log(in_stream, out_stream):
-            for line in iter(in_stream.readline, b""):
-                print(line.decode("utf-8"), end="", file=out_stream)
-
-        thread_stdout = threading.Thread(
-            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
-        )
-        thread_stdout.start()
-
-        thread_stderr = threading.Thread(
-            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
-        )
-        thread_stderr.start()
-
         print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")

         # wait for server to start
@@ -319,7 +304,6 @@ class ServerPreset:
         server.model_hf_repo = "ggml-org/models"
         server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
         server.model_alias = "jina-reranker"
-        server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
         server.n_ctx = 512
         server.n_batch = 512
         server.n_slots = 1
@@ -91,6 +91,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_neon       (void);
     GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
     GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
     GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
     GGML_BACKEND_API int ggml_cpu_has_sve        (void);
     GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void); // sve vector length in bytes
@@ -395,6 +395,9 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0    = 34,
         GGML_TYPE_TQ2_0    = 35,
+        GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_8 = 37,
+        // GGML_TYPE_IQ4_NL_8_8 = 38,
         GGML_TYPE_COUNT,
     };
@@ -1738,13 +1738,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         }
         case GGML_OP_ROPE: {
             // TODO: with ops-test v == 1
-            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
             float * ext_factor = (float*)((int32_t*)op->op_params + 7);
             float * attn_factor = (float*)((int32_t*)op->op_params + 8);
-            // TODO: with freq_factors
-            if (op->src[2] != NULL) {
-                return false;
-            }
             // TODO: n_dims <= ne0
             if (op->src[0]->ne[0] != op->op_params[1]) {
                 return false;
@@ -1753,10 +1748,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
-            // TODO: freq_scale != 1
-            if (*freq_scale != 1) {
-                return false;
-            }
             // TODO: attn_factor != 1
             if (*attn_factor != 1) {
                 return false;
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

+typedef struct {
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
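The new block packs the scales and quants of four row-adjacent IQ4_NL blocks into one contiguous struct so a 4-wide kernel can walk them with a single pointer. A quick stand-alone check of that layout, assuming ggml_half is a 16-bit type and QK4_NL == 32 as in ggml-common.h (ggml_half_t below is only a stand-in):

```cpp
#include <cstdint>

typedef uint16_t ggml_half_t;      // stand-in for ggml_half (fp16 storage)
constexpr int QK4_NL = 32;         // quants per iq4_nl block

struct block_iq4_nlx4_sketch {
    ggml_half_t d[4];              // one fp16 delta per interleaved iq4_nl block
    uint8_t     qs[QK4_NL * 2];    // 4 blocks x 16 bytes of packed nibbles = 64 bytes
};

// 4 * 2 + 64 = 72 bytes, with no padding between the two members
static_assert(sizeof(block_iq4_nlx4_sketch) == 4 * sizeof(ggml_half_t) + QK4_NL * 2,
              "wrong iq4_nlx4 block size/padding");

int main() { return 0; }
```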
@@ -1,7 +1,3 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-// SPDX-License-Identifier: MIT
-//
-
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"

@@ -187,6 +183,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
 }
 #endif

+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
 static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
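kvalues_iq4nl is the 16-entry non-linear codebook shared by the IQ4_NL formats: each stored 4-bit index selects one of these signed values, which is then scaled by the block's fp16 delta. A scalar sketch of decoding one packed byte (two quants) with that table, independent of the NEON kernels below; the byte and delta are example values:

```cpp
#include <cstdint>
#include <cstdio>

static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10,
                                            1,   13,   25,  38,  53,  69,  89, 113};

int main() {
    const uint8_t packed = 0x3A;   // example byte: low nibble = 0xA, high nibble = 0x3
    const float   d      = 0.05f;  // example per-block delta (stored as fp16 in the block)

    // the low nibble decodes the first quant of the pair, the high nibble the second
    const float v0 = d * kvalues_iq4nl[packed & 0x0F];
    const float v1 = d * kvalues_iq4nl[packed >> 4];

    std::printf("v0 = %f, v1 = %f\n", v0, v1);   // 0.05*25 = 1.25, 0.05*(-65) = -3.25
}
```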
@@ -528,7 +526,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -996,6 +994,102 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }

+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float * res_ptr = s;
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+            float32x4_t sumf = vdupq_n_f32(0);
+            for (int l = 0; l < nb; l++) {
+                uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
+                uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
+                uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
+                uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
+
+                int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
+                int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
+                int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
+                int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
+                int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
+                int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
+                int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
+                int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
+
+                int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
+                int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
+
+                int32x4_t sumi = vdupq_n_s32(0);
+                sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
+                sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
+                sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
+                sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
+                sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
+                sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
+                sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
+                sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
+
+                float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+                float32x4_t d = a_d * b_d;
+
+                sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
+            }
+
+            vst1q_f32(res_ptr + x * 4, sumf);
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4];
+        int sumi;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int j = 0; j < ncols_interleaved; j++) {
+                        sumi = 0;
+                        for (int i = 0; i < blocklen; ++i) {
+                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                        }
+                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                    }
+                }
+            }
+            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -1017,7 +1111,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);

 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -3386,6 +3480,117 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }

+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
+
+                float32x4_t sumf[4];
+                for (int m = 0; m < 4; m++) {
+                    sumf[m] = vdupq_n_f32(0);
+                }
+
+                for (int l = 0; l < nb; l++) {
+                    float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
+                    float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
+
+                    int32x4_t sumi_0 = vdupq_n_s32(0);
+                    int32x4_t sumi_1 = vdupq_n_s32(0);
+                    int32x4_t sumi_2 = vdupq_n_s32(0);
+                    int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                    for (int k = 0; k < 4; k++) {
+                        int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
+                        int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
+
+                        uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
+                        int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
+                        int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
+
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
+                    }
+
+                    sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                    sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                    sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                    sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+                }
+
+                for (int m = 0; m < 4; m++) {
+                    vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+                }
+            }
+        }
+        return;
+    }
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 // FIXME: this code is duplicated from ggml-aarch64.c
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;
@@ -3518,6 +3723,70 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
     GGML_UNUSED(data_size);
 }

+static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 2 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            // Using memcpy to avoid unaligned memory accesses
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else if (blck_size_interleave == 4) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 4;
+            int src_offset = (i / 4) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nl dst_tmp[4];
+    int nrow = t->ne[1]; // Number of rows
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 // Prepare for optimized kernels if applicable
 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
     if (cur->type == repack_type) {
@@ -3525,20 +3794,30 @@ void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_
         return;
     }

-    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-
-    switch (repack_type) {
-        case GGML_TYPE_Q4_0_8_8:
-            repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_8:
-            repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-            break;
-        case GGML_TYPE_Q4_0_4_4:
-            repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-            break;
-        default:
-            GGML_ABORT("Unsupported type");
+    if (cur->type == GGML_TYPE_Q4_0) {
+        switch (repack_type) {
+            case GGML_TYPE_Q4_0_8_8:
+                repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_8:
+                repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+                break;
+            case GGML_TYPE_Q4_0_4_4:
+                repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        switch (repack_type) {
+            case GGML_TYPE_IQ4_NL_4_4:
+                repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
+                break;
+            default:
+                GGML_ABORT("Unsupported type");
+        }
+    } else {
+        GGML_ABORT("Unsupported type");
     }
 }
@@ -3551,9 +3830,13 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             return GGML_TYPE_Q4_0_4_8;
         }
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             return GGML_TYPE_Q4_0_4_4;
         }
+    } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            return GGML_TYPE_IQ4_NL_4_4;
+        }
     }

     return cur->type;
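The NEON fast paths in the new iq4_nl kernels above hinge on vdotq_laneq_s32: each call dots every 4-byte group of the de-nibbled weight vector against one selected 4-byte lane of the activation vector and accumulates into four int32 lanes, so a single instruction advances all four interleaved output columns by four elements. A scalar model of just that one intrinsic, for reference (this is not the kernel itself):

```cpp
#include <cstdint>

// Scalar model of vdotq_laneq_s32(acc, w, a, lane):
// acc[i] += w[4*i+0]*a[4*lane+0] + ... + w[4*i+3]*a[4*lane+3]   for i = 0..3
void dot_laneq_s32(int32_t acc[4], const int8_t w[16], const int8_t a[16], int lane) {
    for (int i = 0; i < 4; ++i) {
        int32_t sum = 0;
        for (int k = 0; k < 4; ++k) {
            sum += int32_t(w[4 * i + k]) * int32_t(a[4 * lane + k]);
        }
        acc[i] += sum;
    }
}
```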
@@ -15,11 +15,13 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
 enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
@@ -1814,11 +1814,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                    l1, r1)), l2, r2)), l3, r3))), scale);
     }
-    float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+
+    float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2);
     float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-    vst1_f32(s, vget_low_f32(sumv2));
+
+    vst1_f32(s,      vget_low_f32 (sumv2));
     vst1_f32(s + bs, vget_high_f32(sumv2));

     return;
 }
 #endif
@@ -112,10 +112,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
+    int has_dotprod;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
 #endif

@@ -449,6 +450,15 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .from_float   = NULL,
+        .vec_dot      = NULL,
+        .vec_dot_type = GGML_TYPE_Q8_0,
+        .nrows        = 1,
+        .ncols        = 4,
+        .gemv         = ggml_gemv_iq4_nl_4x4_q8_0,
+        .gemm         = ggml_gemm_iq4_nl_4x4_q8_0,
+    },
 };

 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2443,6 +2453,7 @@ static void ggml_init_arm_arch_features(void) {
     uint32_t hwcap2 = getauxval(AT_HWCAP2);

     ggml_arm_arch_features.has_neon    = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
     ggml_arm_arch_features.has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
     ggml_arm_arch_features.has_sve     = !!(hwcap & HWCAP_SVE);

@@ -2457,6 +2468,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_neon = oldp;

+    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_dotprod = oldp;
+
     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
         oldp = 0;
     }
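On Linux/aarch64 the runtime feature flags come from getauxval(): each capability is a single bit in AT_HWCAP or AT_HWCAP2, tested with a bitwise AND; on macOS the same information comes from sysctlbyname(), as the second hunk shows. A minimal stand-alone probe along those lines (it assumes a Linux/aarch64 target, where these HWCAP names are defined by the kernel headers):

```cpp
#include <cstdio>
#include <sys/auxv.h>   // getauxval, AT_HWCAP / AT_HWCAP2 (Linux)
#include <asm/hwcap.h>  // HWCAP_ASIMD, HWCAP_ASIMDDP, HWCAP_SVE, HWCAP2_I8MM (aarch64)

int main() {
    const unsigned long hwcap  = getauxval(AT_HWCAP);
    const unsigned long hwcap2 = getauxval(AT_HWCAP2);

    // each feature is one bit, so the flag is extracted with a bitwise AND
    std::printf("neon:    %d\n", !!(hwcap  & HWCAP_ASIMD));
    std::printf("dotprod: %d\n", !!(hwcap  & HWCAP_ASIMDDP));
    std::printf("sve:     %d\n", !!(hwcap  & HWCAP_SVE));
    std::printf("i8mm:    %d\n", !!(hwcap2 & HWCAP2_I8MM));
    return 0;
}
```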
@@ -7596,14 +7612,6 @@ UseGgmlGemm2:;
     // This is the size of the rest of the dimensions of the result
     const int64_t nr1 = ne1 * ne2 * ne3;

-    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
-    // TODO: currently the mmla kernels support only even numbered rows/cols.
-    // this check can be removed once they are extended to support odd numbered rows/cols too
-    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        num_rows_per_vec_dot = 1;
-    }
-
     // Now select a reasonable chunk size.
     int chunk_size = 16;

@@ -7666,6 +7674,15 @@ UseGgmlGemm2:;
         const int64_t ir1_start = dr1 * ith1;
         const int64_t ir1_end   = MIN(ir1_start + dr1, nr1);

+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int64_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // TODO: currently the mmla kernels support only even numbered rows/cols.
+        // this check can be removed once they are extended to support odd numbered rows/cols too
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+
         ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

         if (nth >= nchunk0 * nchunk1) {
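The row-pairing decision now happens per chunk rather than once per matmul, since a 2-row mmla/gemm kernel also needs each chunk's own row ranges to have even length. The selection boils down to a small pure function, sketched here only to isolate the condition (names mirror the surrounding code):

```cpp
#include <cstdint>

// How many rows the dot kernel may process at once for one chunk:
// fall back to 1 unless the global and per-chunk row/column counts are all even.
int64_t rows_per_vec_dot_for_chunk(int64_t vec_dot_num_rows,
                                   int64_t nr0, int64_t ne11,
                                   int64_t ir0_start, int64_t ir0_end,
                                   int64_t ir1_start, int64_t ir1_end) {
    if ((nr0 % 2 != 0) || (ne11 % 2 != 0) ||
        ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
        return 1;
    }
    return vec_dot_num_rows;
}
```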
@@ -9169,6 +9186,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_IQ4_NL_4_4:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -13921,6 +13939,14 @@ int ggml_cpu_has_neon(void) {
 #endif
 }

+int ggml_cpu_has_dotprod(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+    return ggml_arm_arch_features.has_dotprod;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
     return ggml_arm_arch_features.has_sve;
@@ -457,7 +457,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     const struct ggml_tensor * src1 = op->src[1];

     if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
             return false;
         }
     }
@@ -47,9 +47,20 @@
 #define CC_TURING     750
 #define CC_AMPERE     800
 #define CC_OFFSET_AMD 1000000
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
+
+// GCN/CNDA, wave size is 64
+#define CC_GCN4       (CC_OFFSET_AMD + 803)  // Tonga, Fiji, Polaris, minimum for fast fp16
+#define CC_VEGA       (CC_OFFSET_AMD + 900)  // Vega56/64, minimum for fp16 dual issue
+#define CC_VEGA20     (CC_OFFSET_AMD + 906)  // MI50/Radeon VII, minimum for dp4a
+#define CC_CDNA       (CC_OFFSET_AMD + 908)  // MI100, minimum for MFMA, acc registers
+#define CC_CDNA2      (CC_OFFSET_AMD + 910)  // MI210, minimum acc register renameing
+#define CC_CDNA3      (CC_OFFSET_AMD + 942)  // MI300
+
+// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+#define CC_RDNA1      (CC_OFFSET_AMD + 1010) // RX 5000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define CC_RDNA3      (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
 #define CC_QY1        210
 #define CC_QY2        220
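The AMD entries reuse the NVIDIA-style "compute capability" integer by adding CC_OFFSET_AMD to the GFX architecture number, so a device's cc can be checked against these thresholds with plain integer comparisons. A small illustration of that encoding; the amd_cc_from_gfx() helper is hypothetical, and the macros simply mirror the ones added above:

```cpp
#include <cstdio>

#define CC_OFFSET_AMD 1000000
#define CC_VEGA20 (CC_OFFSET_AMD + 906)  // MI50 / Radeon VII
#define CC_CDNA   (CC_OFFSET_AMD + 908)  // MI100
#define CC_RDNA2  (CC_OFFSET_AMD + 1030) // RX 6000

// Hypothetical helper: turn a gfx architecture number (e.g. 906 for gfx906)
// into the offset "compute capability" used by the comparisons above.
static int amd_cc_from_gfx(int gfx) { return CC_OFFSET_AMD + gfx; }

int main() {
    const int cc = amd_cc_from_gfx(908);                  // pretend an MI100 was detected
    std::printf("is CDNA:             %d\n", cc == CC_CDNA);
    std::printf("has dp4a (>=VEGA20): %d\n", cc >= CC_VEGA20);
    std::printf("is RDNA2 or newer:   %d\n", cc >= CC_RDNA2);
    return 0;
}
```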
@@ -1108,6 +1108,11 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half alpha_f16 = 1.0f;
         const half beta_f16  = 0.0f;

+        cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+        if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+            cu_compute_type = CUBLAS_COMPUTE_32F;
+        }
+
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
         CUBLAS_CHECK(
             cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
@@ -1115,7 +1120,7 @@ static void ggml_cuda_op_mul_mat_cublas(
                 &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
                             src1_ptr, CUDA_R_16F, ne10,
                 &beta_f16,  dst_f16.get(), CUDA_R_16F, ldc,
-                CUBLAS_COMPUTE_16F,
+                cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));

         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@@ -1608,6 +1613,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t      cu_data_type    = CUDA_R_16F;

+    if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+    }
+
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
@@ -146,5 +146,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }

-    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
@@ -2571,9 +2571,9 @@ static __device__ void mul_mat_q_process_tile(

 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     __launch_bounds__(WARP_SIZE*nwarps, 1)
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;

-    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;

ggml/src/ggml-cuda/vendors/hip.h (vendored)
@@ -95,6 +95,14 @@

 #define __CUDA_ARCH__ 1300

+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
+#define CDNA
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
@@ -997,9 +997,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_SUM_ROWS:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_RMS_NORM:
         case GGML_OP_GROUP_NORM:
             return has_simdgroup_reduction;
+        case GGML_OP_RMS_NORM:
+            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
             return true;
@@ -2672,7 +2673,6 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_GROUP_NORM:
             {
-                GGML_ASSERT(ne00 % 4 == 0);
                 GGML_ASSERT(ggml_is_contiguous(src0));

                 float eps;
@@ -832,6 +832,15 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float       = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .type_name            = "iq4_nl_4x4",
+        .blck_size            = QK4_NL,
+        .blck_size_interleave = 4,
+        .type_size            = sizeof(block_iq4_nl),
+        .is_quantized         = true,
+        .to_float             = NULL,
+        .from_float_ref       = NULL,
+    },
 };

 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -2359,6 +2359,7 @@ enum e_model {
     MODEL_16B,
     MODEL_20B,
     MODEL_30B,
+    MODEL_32B,
     MODEL_34B,
     MODEL_35B,
     MODEL_40B,
@@ -5362,6 +5363,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_16B: return "16B";
         case MODEL_20B: return "20B";
         case MODEL_30B: return "30B";
+        case MODEL_32B: return "32B";
         case MODEL_34B: return "34B";
         case MODEL_35B: return "35B";
         case MODEL_40B: return "40B";
@@ -5722,7 +5724,10 @@ static void llm_load_hparams(
                 case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                 case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                 case 32: model.type = e_model::MODEL_7B; break;
+                case 36: model.type = e_model::MODEL_3B; break;
                 case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+                case 48: model.type = e_model::MODEL_14B; break;
+                case 64: model.type = e_model::MODEL_32B; break;
                 case 80: model.type = e_model::MODEL_70B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }