Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	.github/workflows/editorconfig.yml
#	examples/run/run.cpp
#	examples/server/README.md
#	scripts/sync-ggml.last

Commit 1b49dc305f: 18 changed files with 888 additions and 107 deletions
@@ -23,6 +23,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
return *this;
}

common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
this->excludes = std::move(excludes);
return *this;
}

common_arg & common_arg::set_env(const char * env) {
help = help + "\n(env: " + env + ")";
this->env = env;

@@ -38,6 +43,10 @@ bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}

bool common_arg::is_exclude(enum llama_example ex) {
return excludes.find(ex) != excludes.end();
}

bool common_arg::get_value_from_env(std::string & output) {
if (env == nullptr) return false;
char * value = std::getenv(env);

@@ -421,7 +430,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
*/
auto add_opt = [&](common_arg arg) {
if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
ctx_arg.options.push_back(std::move(arg));
}
};

@@ -650,7 +659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.prompt = value;
}
));
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

@@ -674,7 +683,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt.pop_back();
}
}
));
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",

@@ -701,7 +710,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt = ss.str();
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
}
));
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-e", "--escape"},
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
@@ -12,6 +12,7 @@

struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value

@@ -53,9 +54,11 @@ struct common_arg {
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
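To make the new exclude mechanism concrete, here is a small self-contained sketch (not project code; the enum values and the option are illustrative) of the rule the updated add_opt lambda applies: an argument is kept for the current example when it lists that example or LLAMA_EXAMPLE_COMMON, and does not list the example in its excludes.

```cpp
#include <cstdio>
#include <set>

// Illustrative subset of the real enum; values are placeholders.
enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER };

struct arg_sketch {
    std::set<llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<llama_example> excludes;
    bool in_example(llama_example ex) const { return examples.count(ex) != 0; }
    bool is_exclude(llama_example ex)  const { return excludes.count(ex) != 0; }
};

int main() {
    arg_sketch prompt_arg;                        // stands in for "-p, --prompt"
    prompt_arg.excludes = {LLAMA_EXAMPLE_SERVER}; // .set_excludes({LLAMA_EXAMPLE_SERVER})

    for (llama_example ex : {LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}) {
        // same condition as the updated add_opt lambda
        bool kept = (prompt_arg.in_example(ex) || prompt_arg.in_example(LLAMA_EXAMPLE_COMMON))
                    && !prompt_arg.is_exclude(ex);
        std::printf("example %d -> %s\n", (int) ex, kept ? "registered" : "skipped");
    }
    return 0;
}
```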
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name

@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
"--base", type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
parser.add_argument(
"--base-model-id", type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",

@@ -290,6 +297,7 @@ if __name__ == '__main__':

dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"

@@ -313,7 +321,10 @@ if __name__ == '__main__':
lparams: dict[str, Any] = json.load(f)

# load base model
if dir_base_model is None:
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")

@@ -371,11 +382,16 @@ if __name__ == '__main__':
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name
is_lora_b = ".lora_B.weight" in name
# note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")

@@ -407,9 +423,21 @@ if __name__ == '__main__':
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError("lm_head is present in adapter, but is ignored in base model")
for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue

# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()

# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T

yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
@@ -575,4 +575,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
if (backend_path) {
ggml_backend_load(backend_path);
}
}
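As a usage sketch (the path here is made up): setting `GGML_BACKEND_PATH=/opt/ggml/libggml-mybackend.so` in the environment before starting a ggml-based binary makes `ggml_backend_load_all_from_path()` additionally pass that shared object to `ggml_backend_load()`, so an out-of-tree backend can be registered without rebuilding.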
@@ -54,6 +54,7 @@
#include "ggml-quants.h"

#include <atomic>
#include <array>

#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
@ -1051,6 +1052,704 @@ class tinyBLAS_Q0_AVX {
|
|||
} \
|
||||
} \
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_Q0_PPC {
|
||||
public:
|
||||
tinyBLAS_Q0_PPC(int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int size>
|
||||
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
|
||||
vector signed int vec_C[4];
|
||||
vector float CA[4] = {0};
|
||||
vector float res[4] = {0};
|
||||
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VA, typename VB>
|
||||
void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
|
||||
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
|
||||
VB t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
vector unsigned char xor_vector;
|
||||
uint8_t flip_vec = 0x80;
|
||||
xor_vector = vec_splats(flip_vec);
|
||||
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
|
||||
C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
|
||||
C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
|
||||
C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
|
||||
C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
|
||||
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||
__builtin_vsx_disassemble_pair(c5, &C5);
|
||||
__builtin_vsx_disassemble_pair(c6, &C6);
|
||||
__builtin_vsx_disassemble_pair(c7, &C7);
|
||||
__builtin_vsx_disassemble_pair(c8, &C8);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
t1 = vec_perm(c5[0], c6[0], swiz1);
|
||||
t2 = vec_perm(c5[0], c6[0], swiz2);
|
||||
t3 = vec_perm(c7[0], c8[0], swiz1);
|
||||
t4 = vec_perm(c7[0], c8[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+128);
|
||||
vec_xst(t6, 0, vecOffset+144);
|
||||
vec_xst(t7, 0, vecOffset+160);
|
||||
vec_xst(t8, 0, vecOffset+176);
|
||||
|
||||
t1 = vec_perm(c5[1], c6[1], swiz1);
|
||||
t2 = vec_perm(c5[1], c6[1], swiz2);
|
||||
t3 = vec_perm(c7[1], c8[1], swiz1);
|
||||
t4 = vec_perm(c7[1], c8[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+192);
|
||||
vec_xst(t6, 0, vecOffset+208);
|
||||
vec_xst(t7, 0, vecOffset+224);
|
||||
vec_xst(t8, 0, vecOffset+240);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
aoffset4 += lda;
|
||||
aoffset5 += lda;
|
||||
aoffset6 += lda;
|
||||
aoffset7 += lda;
|
||||
aoffset8 += lda;
|
||||
vecOffset += 256;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
j--;
|
||||
} while(j > 0);
|
||||
}
|
||||
|
||||
if (rows & 4) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
|
||||
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
__builtin_vsx_disassemble_pair(c4, &C4);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
aoffset4 += lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
if (rows & 3) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
__builtin_vsx_disassemble_pair(c3, &C3);
|
||||
case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
__builtin_vsx_disassemble_pair(c2, &C2);
|
||||
case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
__builtin_vsx_disassemble_pair(c1, &C1);
|
||||
break;
|
||||
}
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
int m_rem = MIN(m - m0, 8);
|
||||
int n_rem = MIN(n - n0, 8);
|
||||
// TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
|
||||
// issues. After resolving them, below code will be enabled.
|
||||
/*if (m_rem >= 16 && n_rem >= 8) {
|
||||
mc = 16;
|
||||
nc = 8;
|
||||
gemm<16,8>(m0, m, n0, n);
|
||||
} else if(m_rem >= 8 && n_rem >= 16) {
|
||||
mc = 8;
|
||||
nc = 16;
|
||||
gemm<8,16>(m0, m, n0, n);
|
||||
}*/
|
||||
if (m_rem >= 8 && n_rem >= 8) {
|
||||
mc = 8;
|
||||
nc = 8;
|
||||
gemm<8,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 8) {
|
||||
mc = 4;
|
||||
nc = 8;
|
||||
gemm<4,8>(m0, m, n0, n);
|
||||
} else if (m_rem >= 8 && n_rem >= 4) {
|
||||
mc = 8;
|
||||
nc = 4;
|
||||
gemm<8,4>(m0, m, n0, n);
|
||||
} else if (m_rem >= 4 && n_rem >= 4) {
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm_small<4, 4>(m0, m, n0, n);
|
||||
} else if ((m_rem < 4) && (n_rem > 4)) {
|
||||
nc = 4;
|
||||
switch(m_rem) {
|
||||
case 1:
|
||||
mc = 1;
|
||||
gemm_small<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
mc = 2;
|
||||
gemm_small<2, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
mc = 3;
|
||||
gemm_small<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else if ((m_rem > 4) && (n_rem < 4)) {
|
||||
mc = 4;
|
||||
switch(n_rem) {
|
||||
case 1:
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 2:
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 3:
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
switch((m_rem << 4) | n_rem) {
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm_small<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm_small<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm_small<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm_small<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm_small<3, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm_small<3, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm_small<3, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm_small<2, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm_small<2, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm_small<2, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm_small<2, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm_small<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm_small<1, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm_small<1, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm_small<1, 1>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 4> comparray;
|
||||
vector float fin_res[8] = {0};
|
||||
vector float vs[8] = {0};
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
|
||||
}
|
||||
for (int I = 0; I<4; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
|
||||
}
|
||||
save_res<4, 4>(ii, jj, 0, fin_res);
|
||||
save_res<4, 4>(ii, jj+4, 4, fin_res);
|
||||
}
|
||||
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[8] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 8> comparray;
|
||||
vector float fin_res[8] = {0};
|
||||
vector float vs[8] = {0};
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
|
||||
}
|
||||
for (int I = 0; I<8; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
}
|
||||
save_res<4, 4>(ii, jj, 0, fin_res);
|
||||
save_res<4, 4>(ii+4, jj, 4, fin_res);
|
||||
}
|
||||
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
std::array<int, 8> comparray;
|
||||
vector float fin_res[16] = {0};
|
||||
vector float vs[16] = {0};
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
|
||||
}
|
||||
for (int I = 0; I<8; I++) {
|
||||
for (int J = 0; J<4; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
|
||||
compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
|
||||
}
|
||||
save_res<4, 4>(ii, jj, 0, fin_res);
|
||||
save_res<4, 4>(ii+4, jj, 4, fin_res);
|
||||
save_res<4, 4>(ii, jj+4, 8, fin_res);
|
||||
save_res<4, 4>(ii+4, jj+4, 12, fin_res);
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
vec_t vec_A[8], vec_B[8] = {0};
|
||||
vector signed int vec_C[4];
|
||||
acc_t acc_0;
|
||||
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
std::array<int, RM> comparray;
|
||||
vector float res[4] = {0};
|
||||
vector float fin_res[4] = {0};
|
||||
vector float vs[4] = {0};
|
||||
vector float CA[4] = {0};
|
||||
__builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
|
||||
__builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x+=4) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
|
||||
}
|
||||
for (int I = 0; I<RM; I++) {
|
||||
for (int J = 0; J<RN; J++) {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < RM; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
|
||||
for (int i = 0; i < RM; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
|
||||
}
|
||||
}
|
||||
save_res<RM, RN>(ii, jj, 0, fin_res);
|
||||
}
|
||||
}
|
||||
|
||||
template<int RM, int RN>
|
||||
inline void kernel(int64_t ii, int64_t jj) {
|
||||
if constexpr(RM == 4 && RN == 8) {
|
||||
KERNEL_4x8(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 4) {
|
||||
KERNEL_8x4(ii,jj);
|
||||
} else if constexpr(RM == 8 && RN == 8) {
|
||||
KERNEL_8x8(ii,jj);
|
||||
} else {
|
||||
static_assert(false, "RN/RM values not supported");
|
||||
}
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
kernel<RM, RN>(ii, jj);
|
||||
}
|
||||
}
|
||||
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *C;
|
||||
TA *At;
|
||||
TB *Bt;
|
||||
const int64_t k;
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
class tinyBLAS_PPC {
|
||||
public:
|
||||
|
@ -1070,13 +1769,17 @@ class tinyBLAS_PPC {
|
|||
|
||||
void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
|
||||
|
||||
void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) {
|
||||
template<typename VA>
|
||||
void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
|
||||
int64_t i, j;
|
||||
float *aoffset = NULL, *boffset = NULL;
|
||||
float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
|
||||
aoffset = const_cast<float*>(a);
|
||||
TA *aoffset = NULL, *boffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
|
||||
VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
|
||||
VA t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
aoffset = const_cast<TA*>(a);
|
||||
boffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
|
@ -1092,9 +1795,6 @@ class tinyBLAS_PPC {
|
|||
aoffset += 8 * lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2];
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
|
||||
|
@ -1174,21 +1874,19 @@ class tinyBLAS_PPC {
|
|||
} while(i > 0);
|
||||
}
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4, c5, c6, c7, c8;
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c4 = vec_xl(0, aoffset4);
|
||||
c5 = vec_xl(0, aoffset5);
|
||||
c6 = vec_xl(0, aoffset6);
|
||||
c7 = vec_xl(0, aoffset7);
|
||||
c8 = vec_xl(0, aoffset8);
|
||||
c1[0] = vec_xl(0, aoffset1);
|
||||
c2[0] = vec_xl(0, aoffset2);
|
||||
c3[0] = vec_xl(0, aoffset3);
|
||||
c4[0] = vec_xl(0, aoffset4);
|
||||
c5[0] = vec_xl(0, aoffset5);
|
||||
c6[0] = vec_xl(0, aoffset6);
|
||||
c7[0] = vec_xl(0, aoffset7);
|
||||
c8[0] = vec_xl(0, aoffset8);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t3 = vec_mergeh(c5, c6);
|
||||
t4 = vec_mergeh(c7, c8);
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_mergeh(c5[0], c6[0]);
|
||||
t4 = vec_mergeh(c7[0], c8[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
|
@ -1198,10 +1896,10 @@ class tinyBLAS_PPC {
|
|||
vec_xst(t7, 0, boffset+8);
|
||||
vec_xst(t8, 0, boffset+12);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t3 = vec_mergel(c5, c6);
|
||||
t4 = vec_mergel(c7, c8);
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_mergel(c5[0], c6[0]);
|
||||
t4 = vec_mergel(c7[0], c8[0]);
|
||||
t5 = vec_xxpermdi(t1, t2, 0);
|
||||
t6 = vec_xxpermdi(t3, t4, 0);
|
||||
t7 = vec_xxpermdi(t1, t2, 3);
|
||||
|
@ -1223,9 +1921,6 @@ class tinyBLAS_PPC {
|
|||
aoffset += 4 * lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
__vector_pair C1, C2, C3, C4;
|
||||
vector float c1[2], c2[2], c3[2], c4[2];
|
||||
vector float t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
|
||||
|
@ -1272,22 +1967,20 @@ class tinyBLAS_PPC {
|
|||
}
|
||||
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4;
|
||||
vector float t1, t2, t3, t4;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c4 = vec_xl(0, aoffset4);
|
||||
c1[0] = vec_xl(0, aoffset1);
|
||||
c2[0] = vec_xl(0, aoffset2);
|
||||
c3[0] = vec_xl(0, aoffset3);
|
||||
c4[0] = vec_xl(0, aoffset4);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset);
|
||||
vec_xst(t4, 0, boffset+4);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset+8);
|
||||
|
@ -1299,21 +1992,19 @@ class tinyBLAS_PPC {
|
|||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
if (cols & 4) {
|
||||
vector float c1, c2, c3, c4 = {0};
|
||||
vector float t1, t2, t3, t4;
|
||||
c1 = vec_xl(0, aoffset1);
|
||||
c2 = vec_xl(0, aoffset2);
|
||||
c3 = vec_xl(0, aoffset3);
|
||||
c1[0] = vec_xl(0, aoffset1);
|
||||
c2[0] = vec_xl(0, aoffset2);
|
||||
c3[0] = vec_xl(0, aoffset3);
|
||||
|
||||
t1 = vec_mergeh(c1, c2);
|
||||
t2 = vec_mergeh(c3, c4);
|
||||
t1 = vec_mergeh(c1[0], c2[0]);
|
||||
t2 = vec_mergeh(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset);
|
||||
vec_xst(t4, 0, boffset+4);
|
||||
|
||||
t1 = vec_mergel(c1, c2);
|
||||
t2 = vec_mergel(c3, c4);
|
||||
t1 = vec_mergel(c1[0], c2[0]);
|
||||
t2 = vec_mergel(c3[0], c4[0]);
|
||||
t3 = vec_xxpermdi(t1, t2, 0);
|
||||
t4 = vec_xxpermdi(t1, t2, 3);
|
||||
vec_xst(t3, 0, boffset+8);
|
||||
|
@ -1321,14 +2012,13 @@ class tinyBLAS_PPC {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void KERNEL_4x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[4], vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
for (int l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
|
||||
|
@ -1343,8 +2033,8 @@ class tinyBLAS_PPC {
|
|||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int64_t l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
|
||||
|
@ -1364,8 +2054,8 @@ class tinyBLAS_PPC {
|
|||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
for (int64_t l = 0; l < k; l+=4) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
|
||||
|
@ -1387,8 +2077,8 @@ class tinyBLAS_PPC {
|
|||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
for (int l = 0; l < k; l+=8) {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
|
||||
for(int x = 0; x < 16; x+=2) {
|
||||
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
|
||||
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
|
||||
|
@ -1571,15 +2261,15 @@ class tinyBLAS_PPC {
|
|||
vec_t vec_A[4], vec_B[4];
|
||||
for (int l=0; l<k; l+=4) {
|
||||
if (RN >= 4 && RM == 1) {
|
||||
float* a = const_cast<float*>(A+(ii)*lda+l);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B);
|
||||
TA* a = const_cast<TA*>(A+(ii)*lda+l);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
vec_A[0] = (vec_t)vec_xl(0,a);
|
||||
vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3));
|
||||
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
|
||||
} else {
|
||||
READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A);
|
||||
READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B);
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
|
||||
}
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
|
||||
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
|
||||
|
@ -1589,7 +2279,7 @@ class tinyBLAS_PPC {
|
|||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
for (int I = 0; I < RM; I++) {
|
||||
for (int J = 0; J < RN; J++) {
|
||||
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J);
|
||||
*((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1812,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
params->ith, params->nth};
tb.matmul(m, n);
return true;

#elif defined(__MMA__)
if (n < 8 && n != 4)
return false;
if (m < 8 && m != 4)
return false;
tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
params->ith, params->nth};
tb.matmul(m, n);
return true;

#else
return false;
#endif
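As a small standalone illustration (not project code) of the dispatch guard above: the new MMA Q8_0 path is taken only when each of m and n is either exactly 4 or at least 8; anything else returns false and falls back to the generic implementation.

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the guard: reject n < 8 unless n == 4, and likewise for m.
static bool mma_q8_0_shape_ok(int64_t m, int64_t n) {
    auto ok = [](int64_t v) { return v >= 8 || v == 4; };
    return ok(m) && ok(n);
}

int main() {
    std::printf("%d %d %d\n",
        mma_q8_0_shape_ok(4, 512),   // 1: MMA path taken
        mma_q8_0_shape_ok(8, 8),     // 1: MMA path taken
        mma_q8_0_shape_ok(6, 512));  // 0: falls back
    return 0;
}
```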
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
static_assert(dim >= 0 && dim <= 3);
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
@@ -15,13 +15,13 @@ pip install gguf

[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.

[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.

[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.

[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.

[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.

## Development
Maintainers who participate in development of this package are advised to install it in editable mode:
@@ -1,12 +1,11 @@
[tool.poetry]
name = "gguf"
version = "0.13.0"
version = "0.14.0"
description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
{include = "gguf"},
{include = "gguf/py.typed"},
{include = "scripts"},
]
readme = "README.md"
homepage = "https://ggml.ai"

@@ -33,7 +32,7 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
gguf-dump = "scripts:gguf_dump_entrypoint"
gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint"
gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
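With the entry points now living inside the package, installed usage should be unchanged: after `pip install gguf`, a console script such as `gguf-dump some-model.gguf` (file name hypothetical) resolves to `gguf.scripts:gguf_dump_entrypoint` and dumps the file's metadata, matching the updated script paths in the README above.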
@@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
} else {
ab_map[name].b = cur;
}
} else if (str_endswith(name, "_norm.weight")) {
// TODO: add support for norm vector
// for now, we don't really care because most adapters still work fine without it
continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}

@@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
bool is_token_embd = str_endswith(name, "token_embd.weight");

if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");

@@ -259,16 +264,23 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}

struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
} else {
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
}

// save tensor to adapter
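Written out as shape constraints (ne[0] and ne[1] being ggml's first two dimensions): for a regular weight W the adapter must satisfy a->ne[0] == W->ne[0], b->ne[1] == W->ne[1] and a->ne[1] == b->ne[0] (the shared rank); for token_embd.weight, where mergekit-extract-lora flips A and B, the expected shapes are instead b->ne[1] == W->ne[0] and a->ne[1] == W->ne[1].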
@@ -45,6 +45,13 @@ struct llama_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;

// get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) {
const float rank = (float) b->ne[0];
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
return scale;
}

llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
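For reference, the scaling that get_scale() implements, with s the per-adapter strength passed by the user, alpha the adapter's alpha and r = b->ne[0] the rank, together with how the graph-building helpers below apply it to a weight W:

```latex
\mathrm{scale} =
\begin{cases}
  s\,\alpha / r & \text{if } \alpha \neq 0,\\
  s             & \text{otherwise,}
\end{cases}
\qquad
y = W x + \mathrm{scale}\cdot B\,(A\,x)
```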
@@ -7,14 +7,12 @@
#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>

// TODO: replace with ggml API call
#define QK_K 256

static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {

@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
const int64_t nx = tensor->ne[0];
const int64_t qk_k = ggml_blck_size(new_type);

if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||

@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
new_type == GGML_TYPE_IQ1_M) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
{
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);

if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}

if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
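A minimal standalone sketch (values are hypothetical, not project code) of the rule these hunks now express through ggml_blck_size(): a block-quantized type is only usable when the tensor's row length ne[0] is a multiple of that type's block size; otherwise the tensor is flagged and converted to a compatible fallback type.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t qk_k = 256;                 // hypothetical ggml_blck_size(new_type) for a k-quant
    const int64_t nx[] = {4096, 4608, 1000};  // hypothetical tensor->ne[0] values

    for (int64_t n : nx) {
        bool convert_incompatible_tensor = (n % qk_k != 0);
        std::printf("ne[0]=%lld -> %s\n", (long long) n,
                    convert_incompatible_tensor ? "convert to fallback type" : "keep k-quant");
    }
    return 0;
}
```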
@@ -2652,6 +2652,21 @@ static struct ggml_tensor * llm_build_inp_embd(
ggml_set_input(lctx.inp_tokens);

inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);

// apply lora for embedding tokens if needed
for (auto & it : lctx.lora_adapters) {
struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
if (lora == nullptr) {
continue;
}
const float adapter_scale = it.second;
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
ctx, lora->b, // non-transposed lora_b
ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
), scale);
inpL = ggml_add(ctx, inpL, inpL_delta);
}
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
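Restating the token-embedding path above as a formula: writing a_t for the row of lora_a that ggml_get_rows() selects for token t (this lookup is why the converter transposed token_embd.lora_a), B for lora->b and e(t) for the base embedding row, the adapted embedding is

```latex
e'(t) = e(t) + \mathrm{scale} \cdot B\, a_t
```

with scale coming from get_scale(alpha, adapter_scale) as defined earlier.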
@@ -2724,9 +2739,8 @@ static struct ggml_tensor * llm_build_lora_mm(
if (lora == nullptr) {
continue;
}
const float alpha = it.first->alpha;
const float rank = (float) lora->b->ne[0];
const float scale = alpha ? it.second * alpha / rank : it.second;
const float adapter_scale = it.second;
const float scale = lora->get_scale(it.first->alpha, adapter_scale);
struct ggml_tensor * ab_cur = ggml_mul_mat(
ctx0, lora->b,
ggml_mul_mat(ctx0, lora->a, cur)

@@ -4092,6 +4106,7 @@ struct llm_build_context {

// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {

cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);