mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00

commit f32a874966 (parent 33721615b5): resync and updated sdcpp for flux and sd3 support
30 changed files with 2434248 additions and 1729 deletions
lora.hpp
@@ -5,18 +5,22 @@
 
 #define LORA_GRAPH_SIZE 10240
 
-struct LoraModel : public GGMLModule {
+struct LoraModel : public GGMLRunner {
     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
     std::string file_path;
     ModelLoader model_loader;
-    bool load_failed = false;
+    bool load_failed                = false;
+    bool applied                    = false;
+    std::vector<int> zero_index_vec = {0};
+    ggml_tensor* zero_index         = NULL;
 
     LoraModel(ggml_backend_t backend,
               ggml_type wtype,
-              const std::string file_path = "")
-        : file_path(file_path), GGMLModule(backend, wtype) {
-        if (!model_loader.init_from_file(file_path)) {
+              const std::string& file_path = "",
+              const std::string& prefix = "")
+        : file_path(file_path), GGMLRunner(backend, wtype) {
+        if (!model_loader.init_from_file(file_path, prefix)) {
             load_failed = true;
         }
     }
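Usage note: with this hunk LoraModel derives from GGMLRunner, takes the checkpoint path by const reference, and gains a prefix argument that is forwarded to ModelLoader::init_from_file. Below is a minimal sketch of how a caller might drive the updated interface; the CPU backend, the GGML_TYPE_F32 wtype, the 0.8f multiplier, the thread count and the include paths are illustrative assumptions, not code from this commit.

#include <map>
#include <string>
#include "ggml-backend.h"
#include "lora.hpp"

// Hypothetical helper: merge one LoRA file into an already-loaded set of model tensors.
void apply_lora_sketch(std::map<std::string, struct ggml_tensor*>& model_tensors,
                       const std::string& lora_path) {
    ggml_backend_t backend = ggml_backend_cpu_init();      // any ggml backend would do
    LoraModel lora(backend, GGML_TYPE_F32, lora_path, ""); // new signature: path + optional prefix
    if (lora.load_from_file(true)) {                       // true -> keep only tensors whose name contains "lora"
        lora.multiplier = 0.8f;                            // LoRA strength
        lora.apply(model_tensors, /*n_threads=*/4);        // builds the graph and merges weights in place
    }
    ggml_backend_free(backend);
}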
@@ -25,15 +29,7 @@ struct LoraModel : public GGMLModule {
         return "lora";
     }
 
-    size_t get_params_num() {
-        return LORA_GRAPH_SIZE;
-    }
-
-    size_t get_params_mem_size() {
-        return model_loader.get_params_mem_size(NULL);
-    }
-
-    bool load_from_file() {
+    bool load_from_file(bool filter_tensor = false) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
 
         if (load_failed) {
@@ -45,6 +41,11 @@ struct LoraModel : public GGMLModule {
         auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
             const std::string& name = tensor_storage.name;
 
+            if (filter_tensor && !contains(name, "lora")) {
+                // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
+                return true;
+            }
+
             if (dry_run) {
                 struct ggml_tensor* real = ggml_new_tensor(params_ctx,
                                                            tensor_storage.type,
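The new filter_tensor flag lets load_from_file skip any tensor whose name does not contain "lora", so a full checkpoint can be scanned for embedded LoRA weights. contains() is presumed to be sdcpp's plain substring helper; the standalone sketch below reproduces the same check with std::string::find and made-up tensor names.

#include <cstdio>
#include <string>
#include <vector>

// Assumed behaviour of sdcpp's contains(): a simple substring test.
static bool contains_substr(const std::string& haystack, const std::string& needle) {
    return haystack.find(needle) != std::string::npos;
}

int main() {
    std::vector<std::string> names = {
        "lora.down.weight",                 // kept
        "model.diffusion_model.x_embedder", // skipped when filter_tensor == true
    };
    bool filter_tensor = true;
    for (const auto& name : names) {
        if (filter_tensor && !contains_substr(name, "lora")) {
            printf("skipping '%s'\n", name.c_str());
            continue;
        }
        printf("loading  '%s'\n", name.c_str());
    }
    return 0;
}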
@@ -55,6 +56,7 @@ struct LoraModel : public GGMLModule {
                 auto real = lora_tensors[name];
                 *dst_tensor = real;
             }
 
             return true;
         };
+
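For context, on_new_tensor_cb is driven in two passes: with dry_run == true it only creates placeholder tensors in params_ctx so the parameter buffer can be sized and allocated, and on the second pass it hands the already-created tensors back through dst_tensor so ModelLoader can fill them with data. The sketch below imitates that two-pass shape with ordinary containers; it is an analogy for the control flow only, not the ggml-backed implementation.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    std::map<std::string, size_t> sizes = {{"lora.up", 16}, {"lora.down", 16}};
    std::map<std::string, std::vector<float>> storage;

    bool dry_run = true;
    auto on_new_tensor = [&](const std::string& name, size_t nelem, float** dst) {
        if (dry_run) {
            storage[name];               // pass 1: register a placeholder, allocate nothing yet
        } else {
            storage[name].resize(nelem); // pass 2: "allocate" and hand the buffer back
            *dst = storage[name].data();
        }
    };

    // pass 1: dry run (size everything)
    for (auto& kv : sizes) { float* dst = nullptr; on_new_tensor(kv.first, kv.second, &dst); }
    printf("placeholders registered: %zu\n", storage.size());

    // pass 2: real load
    dry_run = false;
    for (auto& kv : sizes) { float* dst = nullptr; on_new_tensor(kv.first, kv.second, &dst); }
    printf("buffers bound: %zu\n", storage.size());
    return 0;
}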
@@ -68,9 +70,20 @@ struct LoraModel : public GGMLModule {
         return true;
     }
 
-    struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
+    ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
+        auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
+        out = ggml_get_rows(ctx, out, zero_index);
+        out = ggml_reshape(ctx, out, a);
+        return out;
+    }
+
+    struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
         struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
 
+        zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
+        set_backend_tensor_data(zero_index, zero_index_vec.data());
+        ggml_build_forward_expand(gf, zero_index);
+
         std::set<std::string> applied_lora_tensors;
         for (auto it : model_tensors) {
             std::string k_tensor = it.first;
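The to_f32 helper added here is the interesting trick: ggml_get_rows always emits an F32 tensor, so reshaping a weight into a single long "row" and gathering row 0 through the pinned zero_index converts F16 or quantized data to F32 inside the compute graph, with no dedicated dequantize op. Below is a minimal CPU-only sketch of the same idea; it leans on the older single-context ggml helpers bundled with sdcpp at the time (ggml_set_f32, ggml_set_i32, ggml_graph_compute_with_ctx), which newer ggml releases may have moved or removed.

#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params params = {16 * 1024 * 1024, nullptr, false};
    struct ggml_context* ctx = ggml_init(params);

    struct ggml_tensor* w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4, 2); // stand-in for an F16/quantized weight
    ggml_set_f32(w, 1.5f);

    struct ggml_tensor* zero_index = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ggml_set_i32(zero_index, 0);

    // to_f32: flatten, gather "row 0" (the whole tensor, emitted as F32), restore the shape
    struct ggml_tensor* out = ggml_reshape_1d(ctx, w, ggml_nelements(w));
    out = ggml_get_rows(ctx, out, zero_index);
    out = ggml_reshape(ctx, out, w);

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    printf("type=%d first=%f\n", out->type, ggml_get_f32_1d(out, 0)); // expect F32 and 1.500000
    ggml_free(ctx);
    return 0;
}

In the diff itself, zero_index is created once per graph in build_lora_graph and bound with set_backend_tensor_data to zero_index_vec, so the index tensor lives on the compute backend instead of being recreated for every converted weight.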
@@ -141,33 +154,50 @@ struct LoraModel : public GGMLModule {
             GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
             updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
             ggml_tensor* final_weight;
-            // if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-            //     final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
-            //     final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
-            //     final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
-            //     final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
-            // } else {
-            //     final_weight = ggml_add_inplace(compute_ctx, weight, updown);
-            // }
-            final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
+            if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
+                // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
+                // final_weight = ggml_cpy(compute_ctx, weight, final_weight);
+                final_weight = to_f32(compute_ctx, weight);
+                final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
+                final_weight = ggml_cpy(compute_ctx, final_weight, weight);
+            } else {
+                final_weight = ggml_add_inplace(compute_ctx, weight, updown);
+            }
+            // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
             ggml_build_forward_expand(gf, final_weight);
         }
 
         size_t total_lora_tensors_count = 0;
         size_t applied_lora_tensors_count = 0;
 
         for (auto& kv : lora_tensors) {
             total_lora_tensors_count++;
             if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
                 LOG_WARN("unused lora tensor %s", kv.first.c_str());
             } else {
                 applied_lora_tensors_count++;
             }
         }
         /* Don't worry if this message shows up twice in the logs per LoRA,
          * this function is called once to calculate the required buffer size
          * and then again to actually generate a graph to be used */
         if (applied_lora_tensors_count != total_lora_tensors_count) {
             LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
                      applied_lora_tensors_count, total_lora_tensors_count);
         } else {
             LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
                       applied_lora_tensors_count, total_lora_tensors_count);
         }
 
         return gf;
     }
 
     void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(model_tensors);
+            return build_lora_graph(model_tensors);
         };
-        GGMLModule::compute(get_graph, n_threads, true);
+        GGMLRunner::compute(get_graph, n_threads, true);
     }
 };
 
 #endif // __LORA_HPP__
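The rewritten branch is what lets LoRA apply to quantized checkpoints: for anything that is not F32/F16 the weight is first converted with to_f32, the scaled up*down delta is added, and ggml_cpy writes the result back into the original tensor, re-quantizing it in place; plain F32/F16 weights still take the direct ggml_add_inplace path. Numerically the merge is just W' = W + scale * (up * down). The sketch below spells that out on plain arrays; the shapes and the scale = multiplier * alpha / rank convention are assumptions for illustration (scale_value is computed earlier in lora.hpp, outside the quoted hunks).

#include <cstdio>
#include <vector>

int main() {
    const int n = 4, rank = 2;                          // hypothetical sizes
    std::vector<float> weight(n * n, 0.1f);             // W      (n x n)
    std::vector<float> lora_down(rank * n, 0.02f);      // down/A (rank x n)
    std::vector<float> lora_up(n * rank, 0.03f);        // up/B   (n x rank)
    const float multiplier = 1.0f, alpha = 2.0f;
    const float scale = multiplier * alpha / rank;      // assumed scale_value convention

    // updown = up * down, then W' = W + scale * updown (what build_lora_graph expresses with ggml ops)
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            float updown = 0.0f;
            for (int r = 0; r < rank; r++)
                updown += lora_up[i * rank + r] * lora_down[r * n + j];
            weight[i * n + j] += scale * updown;
        }
    }
    printf("W'[0][0] = %f\n", weight[0]);
    return 0;
}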