resync and updated sdcpp for flux and sd3 support

2025-09-11 01:24:36 +00:00 · 2024-11-03 22:03:16 +08:00 · 2024-11-03 22:03:16 +08:00 · f32a874966
commit f32a874966
parent 33721615b5
30 changed files with 2434248 additions and 1729 deletions
--- a/otherarch/sdcpp/lora.hpp
+++ b/otherarch/sdcpp/lora.hpp
@@ -5,18 +5,22 @@

 #define LORA_GRAPH_SIZE 10240

-struct LoraModel : public GGMLModule {
+struct LoraModel : public GGMLRunner {
    float multiplier = 1.0f;
    std::map<std::string, struct ggml_tensor*> lora_tensors;
    std::string file_path;
    ModelLoader model_loader;
-    bool load_failed = false;
+    bool load_failed                = false;
+    bool applied                    = false;
+    std::vector<int> zero_index_vec = {0};
+    ggml_tensor* zero_index         = NULL;

    LoraModel(ggml_backend_t backend,
              ggml_type wtype,
-              const std::string file_path = "")
-        : file_path(file_path), GGMLModule(backend, wtype) {
-        if (!model_loader.init_from_file(file_path)) {
+              const std::string& file_path = "",
+              const std::string& prefix    = "")
+        : file_path(file_path), GGMLRunner(backend, wtype) {
+        if (!model_loader.init_from_file(file_path, prefix)) {
            load_failed = true;
        }
    }
@@ -25,15 +29,7 @@ struct LoraModel : public GGMLModule {
        return "lora";
    }

-    size_t get_params_num() {
-        return LORA_GRAPH_SIZE;
-    }
-
-    size_t get_params_mem_size() {
-        return model_loader.get_params_mem_size(NULL);
-    }
-
-    bool load_from_file() {
+    bool load_from_file(bool filter_tensor = false) {
        LOG_INFO("loading LoRA from '%s'", file_path.c_str());

        if (load_failed) {
@@ -45,6 +41,11 @@ struct LoraModel : public GGMLModule {
        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
            const std::string& name = tensor_storage.name;

+            if (filter_tensor && !contains(name, "lora")) {
+                // LOG_INFO("skipping LoRA tensor '%s'", name.c_str());
+                return true;
+            }
+
            if (dry_run) {
                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
                                                           tensor_storage.type,
@@ -55,6 +56,7 @@ struct LoraModel : public GGMLModule {
                auto real   = lora_tensors[name];
                *dst_tensor = real;
            }
+
            return true;
        };

@@ -68,9 +70,20 @@ struct LoraModel : public GGMLModule {
        return true;
    }

-    struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
+    ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
+        auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
+        out      = ggml_get_rows(ctx, out, zero_index);
+        out      = ggml_reshape(ctx, out, a);
+        return out;
+    }
+
+    struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);

+        zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
+        set_backend_tensor_data(zero_index, zero_index_vec.data());
+        ggml_build_forward_expand(gf, zero_index);
+
        std::set<std::string> applied_lora_tensors;
        for (auto it : model_tensors) {
            std::string k_tensor       = it.first;
@@ -141,33 +154,50 @@ struct LoraModel : public GGMLModule {
            GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
            updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
            ggml_tensor* final_weight;
-            // if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-            //     final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
-            //     final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
-            //     final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
-            //     final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
-            // } else {
-            //     final_weight = ggml_add_inplace(compute_ctx, weight, updown);
-            // }
-            final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
+            if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
+                // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
+                // final_weight = ggml_cpy(compute_ctx, weight, final_weight);
+                final_weight = to_f32(compute_ctx, weight);
+                final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
+                final_weight = ggml_cpy(compute_ctx, final_weight, weight);
+            } else {
+                final_weight = ggml_add_inplace(compute_ctx, weight, updown);
+            }
+            // final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
            ggml_build_forward_expand(gf, final_weight);
        }

+        size_t total_lora_tensors_count   = 0;
+        size_t applied_lora_tensors_count = 0;
+
        for (auto& kv : lora_tensors) {
+            total_lora_tensors_count++;
            if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
                LOG_WARN("unused lora tensor %s", kv.first.c_str());
+            } else {
+                applied_lora_tensors_count++;
            }
        }
+        /* Don't worry if this message shows up twice in the logs per LoRA,
+         * this function is called once to calculate the required buffer size
+         * and then again to actually generate a graph to be used */
+        if (applied_lora_tensors_count != total_lora_tensors_count) {
+            LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
+                     applied_lora_tensors_count, total_lora_tensors_count);
+        } else {
+            LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
+                      applied_lora_tensors_count, total_lora_tensors_count);
+        }

        return gf;
    }

    void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(model_tensors);
+            return build_lora_graph(model_tensors);
        };
-        GGMLModule::compute(get_graph, n_threads, true);
+        GGMLRunner::compute(get_graph, n_threads, true);
    }
 };

-#endif  // __LORA_HPP__
+#endif  // __LORA_HPP__