Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	.github/workflows/editorconfig.yml
#	examples/run/run.cpp
#	examples/server/README.md
#	scripts/sync-ggml.last
Concedo 2025-01-09 16:50:29 +08:00
commit 1b49dc305f
18 changed files with 888 additions and 107 deletions


@@ -23,6 +23,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
return *this;
}
common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
this->excludes = std::move(excludes);
return *this;
}
common_arg & common_arg::set_env(const char * env) {
help = help + "\n(env: " + env + ")";
this->env = env;
@@ -38,6 +43,10 @@ bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
bool common_arg::is_exclude(enum llama_example ex) {
return excludes.find(ex) != excludes.end();
}
bool common_arg::get_value_from_env(std::string & output) {
if (env == nullptr) return false;
char * value = std::getenv(env);
@@ -421,7 +430,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
*/
auto add_opt = [&](common_arg arg) {
-if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
ctx_arg.options.push_back(std::move(arg));
}
};
@@ -650,7 +659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.prompt = value;
}
-));
+).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -674,7 +683,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt.pop_back();
}
}
-));
+).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",
@@ -701,7 +710,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt = ss.str();
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
}
-));
+).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-e", "--escape"},
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),


@@ -12,6 +12,7 @@
struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
@@ -53,9 +54,11 @@ struct common_arg {
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();


@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
"--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
parser.add_argument(
"--base-model-id", type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ if __name__ == '__main__':
dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
@@ -313,7 +321,10 @@ if __name__ == '__main__':
lparams: dict[str, Any] = json.load(f)
# load base model
-if dir_base_model is None:
+if base_model_id is not None:
+logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+hparams = load_hparams_from_hf(base_model_id)
+elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,11 +382,16 @@ if __name__ == '__main__':
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
-is_lora_a = ".lora_A.weight" in name
-is_lora_b = ".lora_B.weight" in name
+# note: mergekit-extract-lora also adds token embeddings to the adapter
+is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@@ -407,9 +423,21 @@ if __name__ == '__main__':
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)


@@ -575,4 +575,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
if (backend_path) {
ggml_backend_load(backend_path);
}
}
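A usage sketch for the new hook, assuming a hypothetical out-of-tree backend library: the variable must point at the shared library file itself, since its value is passed directly to ggml_backend_load() rather than scanned as a directory.

GGML_BACKEND_PATH=/opt/ggml/libggml-mybackend.so ./llama-cli -m model.gguf -p "test"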


@@ -54,6 +54,7 @@
#include "ggml-quants.h"
#include <atomic>
#include <array>
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
@@ -1051,6 +1052,704 @@ class tinyBLAS_Q0_AVX {
} \
} \
template <typename TA, typename TB, typename TC>
class tinyBLAS_Q0_PPC {
public:
tinyBLAS_Q0_PPC(int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
}
void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
}
private:
template<int RM, int RN>
inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
}
}
}
template<int size>
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
vector signed int vec_C[4];
vector float CA[4] = {0};
vector float res[4] = {0};
__builtin_mma_disassemble_acc(vec_C, ACC);
for (int i = 0; i < 4; i++) {
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
}
}
template<typename VA, typename VB>
void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
int64_t i, j;
TA *aoffset = NULL;
VA *vecOffset = NULL;
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
VB t1, t2, t3, t4, t5, t6, t7, t8;
vector unsigned char xor_vector;
uint8_t flip_vec = 0x80;
xor_vector = vec_splats(flip_vec);
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
aoffset = const_cast<TA*>(a);
vecOffset = vec;
j = (rows >> 3);
if (j > 0) {
do {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset5 = aoffset4 + lda;
aoffset6 = aoffset5 + lda;
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
i = (cols >> 3);
if (i > 0) {
do {
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
__builtin_vsx_disassemble_pair(c1, &C1);
__builtin_vsx_disassemble_pair(c2, &C2);
__builtin_vsx_disassemble_pair(c3, &C3);
__builtin_vsx_disassemble_pair(c4, &C4);
__builtin_vsx_disassemble_pair(c5, &C5);
__builtin_vsx_disassemble_pair(c6, &C6);
__builtin_vsx_disassemble_pair(c7, &C7);
__builtin_vsx_disassemble_pair(c8, &C8);
t1 = vec_perm(c1[0], c2[0], swiz1);
t2 = vec_perm(c1[0], c2[0], swiz2);
t3 = vec_perm(c3[0], c4[0], swiz1);
t4 = vec_perm(c3[0], c4[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
t1 = vec_perm(c1[1], c2[1], swiz1);
t2 = vec_perm(c1[1], c2[1], swiz2);
t3 = vec_perm(c3[1], c4[1], swiz1);
t4 = vec_perm(c3[1], c4[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
vec_xst(t7, 0, vecOffset+96);
vec_xst(t8, 0, vecOffset+112);
t1 = vec_perm(c5[0], c6[0], swiz1);
t2 = vec_perm(c5[0], c6[0], swiz2);
t3 = vec_perm(c7[0], c8[0], swiz1);
t4 = vec_perm(c7[0], c8[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+128);
vec_xst(t6, 0, vecOffset+144);
vec_xst(t7, 0, vecOffset+160);
vec_xst(t8, 0, vecOffset+176);
t1 = vec_perm(c5[1], c6[1], swiz1);
t2 = vec_perm(c5[1], c6[1], swiz2);
t3 = vec_perm(c7[1], c8[1], swiz1);
t4 = vec_perm(c7[1], c8[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+192);
vec_xst(t6, 0, vecOffset+208);
vec_xst(t7, 0, vecOffset+224);
vec_xst(t8, 0, vecOffset+240);
aoffset1 += lda;
aoffset2 += lda;
aoffset3 += lda;
aoffset4 += lda;
aoffset5 += lda;
aoffset6 += lda;
aoffset7 += lda;
aoffset8 += lda;
vecOffset += 256;
i--;
} while(i > 0);
}
j--;
} while(j > 0);
}
if (rows & 4) {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset += 4 * lda;
i = (cols >> 3);
if (i > 0) {
do {
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
__builtin_vsx_disassemble_pair(c1, &C1);
__builtin_vsx_disassemble_pair(c2, &C2);
__builtin_vsx_disassemble_pair(c3, &C3);
__builtin_vsx_disassemble_pair(c4, &C4);
t1 = vec_perm(c1[0], c2[0], swiz1);
t2 = vec_perm(c1[0], c2[0], swiz2);
t3 = vec_perm(c3[0], c4[0], swiz1);
t4 = vec_perm(c3[0], c4[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
t1 = vec_perm(c1[1], c2[1], swiz1);
t2 = vec_perm(c1[1], c2[1], swiz2);
t3 = vec_perm(c3[1], c4[1], swiz1);
t4 = vec_perm(c3[1], c4[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
vec_xst(t7, 0, vecOffset+96);
vec_xst(t8, 0, vecOffset+112);
aoffset1 += lda;
aoffset2 += lda;
aoffset3 += lda;
aoffset4 += lda;
vecOffset += 128;
i--;
} while(i > 0);
}
}
if (rows & 3) {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
i = (cols >> 3);
if (i > 0) {
do {
switch(rows) {
case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
__builtin_vsx_disassemble_pair(c3, &C3);
case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
__builtin_vsx_disassemble_pair(c2, &C2);
case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
__builtin_vsx_disassemble_pair(c1, &C1);
break;
}
t1 = vec_perm(c1[0], c2[0], swiz1);
t2 = vec_perm(c1[0], c2[0], swiz2);
t3 = vec_perm(c3[0], c4[0], swiz1);
t4 = vec_perm(c3[0], c4[0], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
t1 = vec_perm(c1[1], c2[1], swiz1);
t2 = vec_perm(c1[1], c2[1], swiz2);
t3 = vec_perm(c3[1], c4[1], swiz1);
t4 = vec_perm(c3[1], c4[1], swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset+64);
vec_xst(t6, 0, vecOffset+80);
vec_xst(t7, 0, vecOffset+96);
vec_xst(t8, 0, vecOffset+112);
aoffset1 += lda;
aoffset2 += lda;
aoffset3 += lda;
vecOffset += 128;
i--;
} while(i > 0);
}
}
}
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t mc, nc, mp, np;
int m_rem = MIN(m - m0, 8);
int n_rem = MIN(n - n0, 8);
// TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
// issues. After resolving them, below code will be enabled.
/*if (m_rem >= 16 && n_rem >= 8) {
mc = 16;
nc = 8;
gemm<16,8>(m0, m, n0, n);
} else if(m_rem >= 8 && n_rem >= 16) {
mc = 8;
nc = 16;
gemm<8,16>(m0, m, n0, n);
}*/
if (m_rem >= 8 && n_rem >= 8) {
mc = 8;
nc = 8;
gemm<8,8>(m0, m, n0, n);
} else if (m_rem >= 4 && n_rem >= 8) {
mc = 4;
nc = 8;
gemm<4,8>(m0, m, n0, n);
} else if (m_rem >= 8 && n_rem >= 4) {
mc = 8;
nc = 4;
gemm<8,4>(m0, m, n0, n);
} else if (m_rem >= 4 && n_rem >= 4) {
mc = 4;
nc = 4;
gemm_small<4, 4>(m0, m, n0, n);
} else if ((m_rem < 4) && (n_rem > 4)) {
nc = 4;
switch(m_rem) {
case 1:
mc = 1;
gemm_small<1, 4>(m0, m, n0, n);
break;
case 2:
mc = 2;
gemm_small<2, 4>(m0, m, n0, n);
break;
case 3:
mc = 3;
gemm_small<3, 4>(m0, m, n0, n);
break;
default:
return;
}
} else if ((m_rem > 4) && (n_rem < 4)) {
mc = 4;
switch(n_rem) {
case 1:
nc = 1;
gemm_small<4, 1>(m0, m, n0, n);
break;
case 2:
nc = 2;
gemm_small<4, 2>(m0, m, n0, n);
break;
case 3:
nc = 3;
gemm_small<4, 3>(m0, m, n0, n);
break;
default:
return;
}
} else {
switch((m_rem << 4) | n_rem) {
case 0x43:
mc = 4;
nc = 3;
gemm_small<4, 3>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
gemm_small<4, 2>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
gemm_small<4, 1>(m0, m, n0, n);
break;
case 0x34:
mc = 3;
nc = 4;
gemm_small<3, 4>(m0, m, n0, n);
break;
case 0x33:
mc = 3;
nc = 3;
gemm_small<3, 3>(m0, m, n0, n);
break;
case 0x32:
mc = 3;
nc = 2;
gemm_small<3, 2>(m0, m, n0, n);
break;
case 0x31:
mc = 3;
nc = 1;
gemm_small<3, 1>(m0, m, n0, n);
break;
case 0x24:
mc = 2;
nc = 4;
gemm_small<2, 4>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm_small<2, 3>(m0, m, n0, n);
break;
case 0x22:
mc = 2;
nc = 2;
gemm_small<2, 2>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm_small<2, 1>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
gemm_small<1, 4>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm_small<1, 3>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm_small<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm_small<1, 1>(m0, m, n0, n);
break;
default:
return;
}
}
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
}
void KERNEL_4x8(int64_t ii, int64_t jj) {
vec_t vec_A[8], vec_B[16] = {0};
acc_t acc_0, acc_1;
std::array<int, 4> comparray;
vector float fin_res[8] = {0};
vector float vs[8] = {0};
for (int l = 0; l < k; l++) {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
}
for (int I = 0; I<4; I++) {
for (int J = 0; J<4; J++) {
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
}
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 4; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
}
save_res<4, 4>(ii, jj, 0, fin_res);
save_res<4, 4>(ii, jj+4, 4, fin_res);
}
void KERNEL_8x4(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[8] = {0};
acc_t acc_0, acc_1;
std::array<int, 8> comparray;
vector float fin_res[8] = {0};
vector float vs[8] = {0};
for (int l = 0; l < k; l++) {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
}
for (int I = 0; I<8; I++) {
for (int J = 0; J<4; J++) {
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
}
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 8; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
}
save_res<4, 4>(ii, jj, 0, fin_res);
save_res<4, 4>(ii+4, jj, 4, fin_res);
}
void KERNEL_8x8(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[16] = {0};
acc_t acc_0, acc_1, acc_2, acc_3;
std::array<int, 8> comparray;
vector float fin_res[16] = {0};
vector float vs[16] = {0};
for (int l = 0; l < k; l++) {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
__builtin_mma_xxsetaccz(&acc_2);
__builtin_mma_xxsetaccz(&acc_3);
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
__builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
__builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
}
for (int I = 0; I<8; I++) {
for (int J = 0; J<4; J++) {
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
}
}
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < 8; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
}
save_res<4, 4>(ii, jj, 0, fin_res);
save_res<4, 4>(ii+4, jj, 4, fin_res);
save_res<4, 4>(ii, jj+4, 8, fin_res);
save_res<4, 4>(ii+4, jj+4, 12, fin_res);
}
template<int RM, int RN>
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
vec_t vec_A[8], vec_B[8] = {0};
vector signed int vec_C[4];
acc_t acc_0;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
std::array<int, RM> comparray;
vector float res[4] = {0};
vector float fin_res[4] = {0};
vector float vs[4] = {0};
vector float CA[4] = {0};
__builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
__builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
for (int l = 0; l < k; l++) {
__builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
__builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
__builtin_mma_xxsetaccz(&acc_0);
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
for(int x = 0; x < 8; x+=4) {
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
}
for (int I = 0; I<RM; I++) {
for (int J = 0; J<RN; J++) {
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
}
}
__builtin_mma_disassemble_acc(vec_C, &acc_0);
auto aoffset = A+(ii*lda)+l;
for (int i = 0; i < RM; i++) {
comparray[i] = 0;
int ca = 0;
const int8_t *at = aoffset->qs;
for (int j = 0; j < 32; j++)
ca += (int)*at++;
comparray[i] = ca;
aoffset += lda;
}
for (int i = 0; i < RM; i++) {
CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
}
}
save_res<RM, RN>(ii, jj, 0, fin_res);
}
}
template<int RM, int RN>
inline void kernel(int64_t ii, int64_t jj) {
if constexpr(RM == 4 && RN == 8) {
KERNEL_4x8(ii,jj);
} else if constexpr(RM == 8 && RN == 4) {
KERNEL_8x4(ii,jj);
} else if constexpr(RM == 8 && RN == 8) {
KERNEL_8x8(ii,jj);
} else {
static_assert(false, "RN/RM values not supported");
}
}
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
kernel<RM, RN>(ii, jj);
}
}
const TA *const A;
const TB *const B;
TC *C;
TA *At;
TB *Bt;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
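A note on the arithmetic in packNormal() and compute() above, reconstructed from the code: B's signed q8_0 values are XOR-ed with 0x80 when flip is true, i.e. shifted by +128 into unsigned range before the xvi8ger4pp integer multiply. Since

sum_j a_j * (b_j + 128) = sum_j a_j * b_j + 128 * sum_j a_j

the kernels first accumulate each A row's 32 quantized values into comparray, and compute() then adds the correction CA[i] = -128 * comparray[i] to undo the shift before scaling by vs, the per-block product of the two d scales.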
template <typename TA, typename TB, typename TC>
class tinyBLAS_PPC {
public:
@@ -1070,13 +1769,17 @@ class tinyBLAS_PPC {
void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
-void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
+template<typename VA>
+void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
int64_t i, j;
-float *aoffset = NULL, *boffset = NULL;
-float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
-float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+TA *aoffset = NULL, *boffset = NULL;
+TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
+VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+VA t1, t2, t3, t4, t5, t6, t7, t8;
-aoffset = const_cast<float*>(a);
+aoffset = const_cast<TA*>(a);
boffset = vec;
j = (rows >> 3);
if (j > 0) {
@@ -1092,9 +1795,6 @@ class tinyBLAS_PPC {
aoffset += 8 * lda;
i = (cols >> 3);
if (i > 0) {
-__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
-vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
-vector float t1, t2, t3, t4, t5, t6, t7, t8;
do {
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
@@ -1174,21 +1874,19 @@ class tinyBLAS_PPC {
} while(i > 0);
}
if (cols & 4) {
-vector float c1, c2, c3, c4, c5, c6, c7, c8;
-vector float t1, t2, t3, t4, t5, t6, t7, t8;
-c1 = vec_xl(0, aoffset1);
-c2 = vec_xl(0, aoffset2);
-c3 = vec_xl(0, aoffset3);
-c4 = vec_xl(0, aoffset4);
-c5 = vec_xl(0, aoffset5);
-c6 = vec_xl(0, aoffset6);
-c7 = vec_xl(0, aoffset7);
-c8 = vec_xl(0, aoffset8);
+c1[0] = vec_xl(0, aoffset1);
+c2[0] = vec_xl(0, aoffset2);
+c3[0] = vec_xl(0, aoffset3);
+c4[0] = vec_xl(0, aoffset4);
+c5[0] = vec_xl(0, aoffset5);
+c6[0] = vec_xl(0, aoffset6);
+c7[0] = vec_xl(0, aoffset7);
+c8[0] = vec_xl(0, aoffset8);
-t1 = vec_mergeh(c1, c2);
-t2 = vec_mergeh(c3, c4);
-t3 = vec_mergeh(c5, c6);
-t4 = vec_mergeh(c7, c8);
+t1 = vec_mergeh(c1[0], c2[0]);
+t2 = vec_mergeh(c3[0], c4[0]);
+t3 = vec_mergeh(c5[0], c6[0]);
+t4 = vec_mergeh(c7[0], c8[0]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t3, t4, 0);
t7 = vec_xxpermdi(t1, t2, 3);
@@ -1198,10 +1896,10 @@ class tinyBLAS_PPC {
vec_xst(t7, 0, boffset+8);
vec_xst(t8, 0, boffset+12);
-t1 = vec_mergel(c1, c2);
-t2 = vec_mergel(c3, c4);
-t3 = vec_mergel(c5, c6);
-t4 = vec_mergel(c7, c8);
+t1 = vec_mergel(c1[0], c2[0]);
+t2 = vec_mergel(c3[0], c4[0]);
+t3 = vec_mergel(c5[0], c6[0]);
+t4 = vec_mergel(c7[0], c8[0]);
t5 = vec_xxpermdi(t1, t2, 0);
t6 = vec_xxpermdi(t3, t4, 0);
t7 = vec_xxpermdi(t1, t2, 3);
@@ -1223,9 +1921,6 @@ class tinyBLAS_PPC {
aoffset += 4 * lda;
i = (cols >> 3);
if (i > 0) {
-__vector_pair C1, C2, C3, C4;
-vector float c1[2], c2[2], c3[2], c4[2];
-vector float t1, t2, t3, t4, t5, t6, t7, t8;
do {
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
@@ -1272,22 +1967,20 @@ class tinyBLAS_PPC {
}
if (cols & 4) {
-vector float c1, c2, c3, c4;
-vector float t1, t2, t3, t4;
-c1 = vec_xl(0, aoffset1);
-c2 = vec_xl(0, aoffset2);
-c3 = vec_xl(0, aoffset3);
-c4 = vec_xl(0, aoffset4);
+c1[0] = vec_xl(0, aoffset1);
+c2[0] = vec_xl(0, aoffset2);
+c3[0] = vec_xl(0, aoffset3);
+c4[0] = vec_xl(0, aoffset4);
-t1 = vec_mergeh(c1, c2);
-t2 = vec_mergeh(c3, c4);
+t1 = vec_mergeh(c1[0], c2[0]);
+t2 = vec_mergeh(c3[0], c4[0]);
t3 = vec_xxpermdi(t1, t2, 0);
t4 = vec_xxpermdi(t1, t2, 3);
vec_xst(t3, 0, boffset);
vec_xst(t4, 0, boffset+4);
-t1 = vec_mergel(c1, c2);
-t2 = vec_mergel(c3, c4);
+t1 = vec_mergel(c1[0], c2[0]);
+t2 = vec_mergel(c3[0], c4[0]);
t3 = vec_xxpermdi(t1, t2, 0);
t4 = vec_xxpermdi(t1, t2, 3);
vec_xst(t3, 0, boffset+8);
@@ -1299,21 +1992,19 @@ class tinyBLAS_PPC {
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
if (cols & 4) {
-vector float c1, c2, c3, c4 = {0};
-vector float t1, t2, t3, t4;
-c1 = vec_xl(0, aoffset1);
-c2 = vec_xl(0, aoffset2);
-c3 = vec_xl(0, aoffset3);
+c1[0] = vec_xl(0, aoffset1);
+c2[0] = vec_xl(0, aoffset2);
+c3[0] = vec_xl(0, aoffset3);
-t1 = vec_mergeh(c1, c2);
-t2 = vec_mergeh(c3, c4);
+t1 = vec_mergeh(c1[0], c2[0]);
+t2 = vec_mergeh(c3[0], c4[0]);
t3 = vec_xxpermdi(t1, t2, 0);
t4 = vec_xxpermdi(t1, t2, 3);
vec_xst(t3, 0, boffset);
vec_xst(t4, 0, boffset+4);
-t1 = vec_mergel(c1, c2);
-t2 = vec_mergel(c3, c4);
+t1 = vec_mergel(c1[0], c2[0]);
+t2 = vec_mergel(c3[0], c4[0]);
t3 = vec_xxpermdi(t1, t2, 0);
t4 = vec_xxpermdi(t1, t2, 3);
vec_xst(t3, 0, boffset+8);
@@ -1321,14 +2012,13 @@ class tinyBLAS_PPC {
}
}
}
void KERNEL_4x4(int64_t ii, int64_t jj) {
vec_t vec_A[4], vec_B[4], vec_C[4];
acc_t acc_0;
__builtin_mma_xxsetaccz(&acc_0);
for (int l = 0; l < k; l+=4) {
-READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
-READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
@@ -1343,8 +2033,8 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
for (int64_t l = 0; l < k; l+=4) {
-READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
-READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
+packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
@@ -1364,8 +2054,8 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
for (int64_t l = 0; l < k; l+=4) {
-READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
-READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
+packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
@@ -1387,8 +2077,8 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_2);
__builtin_mma_xxsetaccz(&acc_3);
for (int l = 0; l < k; l+=8) {
-READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
-READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
+packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
+packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
for(int x = 0; x < 16; x+=2) {
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
@@ -1571,15 +2261,15 @@ class tinyBLAS_PPC {
vec_t vec_A[4], vec_B[4];
for (int l=0; l<k; l+=4) {
if (RN >= 4 && RM == 1) {
-float* a = const_cast<float*>(A+(ii)*lda+l);
-READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
+TA* a = const_cast<TA*>(A+(ii)*lda+l);
+packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
vec_A[0] = (vec_t)vec_xl(0,a);
-vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
-vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
-vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
+vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
+vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
+vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
} else {
-READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
-READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
+packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
}
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
@@ -1589,7 +2279,7 @@ class tinyBLAS_PPC {
__builtin_mma_disassemble_acc(vec_C, &acc_0);
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
-*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
+*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
}
}
}
@@ -1812,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__MMA__)
if (n < 8 && n != 4)
return false;
if (m < 8 && m != 4)
return false;
tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
return false;
#endif


@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
-static_assert(dim >= 0 && dim <= 3);
+static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;


@@ -15,13 +15,13 @@ pip install gguf
[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
-[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
-[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
-[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
-[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
+[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
+[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
+[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
## Development
Maintainers who participate in development of this package are advised to install it in editable mode:


@@ -1,12 +1,11 @@
[tool.poetry]
name = "gguf"
-version = "0.13.0"
+version = "0.14.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
{include = "gguf"},
{include = "gguf/py.typed"},
-{include = "scripts"},
]
readme = "README.md"
homepage = "https://ggml.ai"
@@ -33,7 +32,7 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
-gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
-gguf-dump = "scripts:gguf_dump_entrypoint"
-gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
-gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint"
+gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
+gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
+gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
+gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"


@@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
} else {
ab_map[name].b = cur;
}
} else if (str_endswith(name, "_norm.weight")) {
// TODO: add support for norm vector
// for now, we don't really care because most adapters still work fine without it
continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
@@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
bool is_token_embd = str_endswith(name, "token_embd.weight");
if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -259,17 +264,24 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
if (!model_tensor) {
-throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
} else {
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
}
// save tensor to adapter
struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
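A dimension walk-through of the flipped token-embedding pair, as a hedged sketch: with tok_embd of shape ne = {n_embd, n_vocab} and a rank-r adapter, the converter above stores a transposed with ne = {r, n_vocab} and b non-transposed with ne = {r, n_embd}; hence the comparisons model ne[0] == b->ne[1] and model ne[1] == a->ne[1], whereas regular matmul weights keep the usual a/b orientation.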


@@ -45,6 +45,13 @@ struct llama_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;
// get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) {
const float rank = (float) b->ne[0];
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
return scale;
}
llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
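For reference, get_scale() above is the standard LoRA scaling with r = b->ne[0] as the adapter rank: the runtime update applied by the build functions is W' = W + s * B A, where s = adapter_scale * alpha / r, falling back to s = adapter_scale when alpha is 0.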


@@ -7,14 +7,12 @@
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>
-// TODO: replace with ggml API call
-#define QK_K 256
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
-int nx = tensor->ne[0];
-if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+const int64_t nx = tensor->ne[0];
+const int64_t qk_k = ggml_blck_size(new_type);
+if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
-if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
-new_type == GGML_TYPE_IQ1_M) {
-int nx = tensor->ne[0];
-int ny = tensor->ne[1];
-if (nx % QK_K != 0) {
-LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+{
+const int64_t nx = tensor->ne[0];
+const int64_t ny = tensor->ne[1];
+const int64_t qk_k = ggml_blck_size(new_type);
+if (nx % qk_k != 0) {
+LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
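A worked example of the relaxed check, under the assumption that ggml_blck_size() reports the type's block size (256 for the k-quants, 32 for q8_0): with new_type = GGML_TYPE_Q6_K, a tensor with ne[0] = 4096 passes (4096 % 256 == 0), while ne[0] = 4000 does not and triggers the fallback conversion; the old test was hard-wired to QK_K = 256 and an explicit list of k-quant types.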


@@ -2652,6 +2652,21 @@ static struct ggml_tensor * llm_build_inp_embd(
ggml_set_input(lctx.inp_tokens);
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
// apply lora for embedding tokens if needed
for (auto & it : lctx.lora_adapters) {
struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
if (lora == nullptr) {
continue;
}
const float adapter_scale = it.second;
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
ctx, lora->b, // non-transposed lora_b
ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
), scale);
inpL = ggml_add(ctx, inpL, inpL_delta);
}
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
@@ -2724,9 +2739,8 @@ static struct ggml_tensor * llm_build_lora_mm(
if (lora == nullptr) {
continue;
}
-const float alpha = it.first->alpha;
-const float rank = (float) lora->b->ne[0];
-const float scale = alpha ? it.second * alpha / rank : it.second;
+const float adapter_scale = it.second;
+const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * ab_cur = ggml_mul_mat(
ctx0, lora->b,
ggml_mul_mat(ctx0, lora->a, cur)
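Taken together, the two hunks compute the same per-adapter update: llm_build_inp_embd() adds scale * mul_mat(lora->b, get_rows(lora->a, tokens)) on top of the embedding lookup, and llm_build_lora_mm() computes cur + scale * B(A * cur) for regular matmuls, i.e. y = Wx + s * B(Ax), with both call sites obtaining s from the shared llama_lora_weight::get_scale() helper.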
@@ -4092,6 +4106,7 @@ struct llm_build_context {
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);