fix kappa and memory bounds, account for look-up table and input/output layer delay

2025-09-06 08:59:09 +00:00 · 2025-01-25 22:31:40 +04:00 · 2025-01-25 22:31:40 +04:00 · f3dd5776eb
commit f3dd5776eb
parent 9e4ba4f06a
2 changed files with 34 additions and 34 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -832,7 +832,7 @@ std::string fs_get_cache_file(const std::string & filename) {
    return cache_directory + filename;
 }

-static bool assign_device(
+static bool assign_layers_to_device(
                                uint32_t   n_world, 
                                uint32_t   my_rank, 
                       const device_info * dev_info_set, 
@ -857,6 +857,7 @@ static bool assign_device(
    // model-specific constants
    const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
    const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
+    const int n_vocab      = llama_n_vocab(model);
    const int n_kv         = cparams.n_ctx;

    const int64_t b        = dev_info_set[0].model_bytes.nb_layer;
@ -876,7 +877,7 @@ static bool assign_device(
    // -------- Compute alpha[m], beta[m], xi[m] --------
    for (uint32_t m = 0; m < n_world; ++m) {
        // alpha[m]
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
        float t_calc_cpu = (
            master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
            master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
@ -931,7 +932,7 @@ static bool assign_device(
    // - $d_m^{\text{avail}}+d_m^{\text{swapout}}$ for Android
    // and $n_m$ is initialized to 0. 
    for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
        GGML_ASSERT(dev.device_os != nullptr);

        bool is_macos   = strcmp(dev.device_os, "macOS") == 0;
@ -968,7 +969,7 @@ static bool assign_device(
    // stores the actual read bandwidth (GB/s) for each device
    std::vector<float> disk_speed(n_world, 0.0f);
    for (uint32_t m = 0; m < n_world; ++m) {
-        const device_info &dev = dev_info_set[m];
+        const device_info & dev = dev_info_set[m];
        GGML_ASSERT(dev.device_os != nullptr);
        bool is_linux = strcmp(dev.device_os, "Linux") == 0;

@ -1010,7 +1011,7 @@ static bool assign_device(
        M1.clear(), M2.clear(), M3.clear(), M4.clear();

        for (uint32_t m = 0; m < n_world; ++m) {
-            const device_info &dev = dev_info_set[m];
+            const device_info & dev = dev_info_set[m];

            GGML_ASSERT(dev.device_os != nullptr);
            bool is_macos   = strcmp(dev.device_os, "macOS") == 0;
@ -1023,9 +1024,9 @@ static bool assign_device(

            int  l_m          = w[m] * k;  // total number of layers assigned to device m
            int  l_m_gpu      = n[m] * k;  // number of layers assigned to device m that run on GPU
-            bool condition1   = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] > mem_budget[m] * GIGABYTE;
-            bool condition2   = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] + c_gpu[m] > mem_budget[m] * GIGABYTE;
-            bool condition3   = (l_m - l_m_gpu) * b_prime + (bi + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
+            bool condition1   = l_m * b + (bi / n_vocab + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] > mem_budget[m] * GIGABYTE;
+            bool condition2   = l_m * b + (bi / n_vocab + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] + c_gpu[m] > mem_budget[m] * GIGABYTE;
+            bool condition3   = (l_m - l_m_gpu) * b_prime + (bi / n_vocab + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
            bool is_slow_disk = disk_speed[m] < min_disk_read_speed;

            if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
@ -1083,13 +1084,26 @@ static bool assign_device(

        // update kappa
        for (uint32_t m = 0; m < n_world; ++m) {
-            const device_info &dev = dev_info_set[m];
+            const device_info & dev = dev_info_set[m];
            GGML_ASSERT(dev.device_os != nullptr);
            bool is_android = strcmp(dev.device_os, "Android") == 0;

-            if (m == 0 && !in_set(m, M4)) {
-                kappa = (bi + bo) / (disk_speed[m] * 1e9) * 1000;  // in ms
+            if (m == 0) {
+                kappa = (
+                    dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
+                    dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
+                    dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
+                    dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
+                    dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
+                    dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+
+                kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
+
+                if (!in_set(m, M4)) {
+                    kappa += (bi / n_vocab + bo) / (disk_speed[m] * 1e9) * 1000; // in ms
+                }
            }
+
            if (in_set(m, M3)) {
                kappa += (c_cpu[m] - dev.memory.available_physical * GIGABYTE - dev.memory.used_can_swap * GIGABYTE * int(is_android)) / (disk_speed[m] * 1e9) * 1000; // in ms
            }
@ -1128,23 +1142,11 @@ static bool assign_device(
        // -------------------------------------------------------------
        // Construct vectors vz, vz_gpu
        // -------------------------------------------------------------
-        // z and z_gpu are used to express memory constraints:
-        // for z:
-        //   - M1:  (d_m^{avail} - b_cio) / (L*b')
-        //   - M2:  (d_m^{total} - b_cio - c_gpu) / (L*b')
-        //   - M3:  (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b')
-        //   - M4:  - (d_m^{avail} - b_cio) / (L*b') on macOS without Metal,
-        //       or - (d_m^{total} - b_cio - c_gpu) / (L*b') on macOS with Metal,
-        //       or - (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b') on Linux or Android
-        //
-        // for z_gpu:
-        //   - M1:  (d_{m,cuda}^{avail} - c_gpu) / (L*b'),
-        // d_{m,cuda}^{avail} is non-zero only if the device supports CUDA
        std::vector<float> vec_z(n_world, 0.0f), vec_z_gpu(n_world, 0.0f);
        std::vector<int> dev_gpu(n_world, 0);

        for (uint32_t m = 0; m < n_world; ++m) {
-            const device_info &dev = dev_info_set[m];
+            const device_info & dev = dev_info_set[m];

            GGML_ASSERT(dev.device_os != nullptr);
            bool is_macos   = strcmp(dev.device_os, "macOS") == 0;
@ -1152,7 +1154,7 @@ static bool assign_device(
            bool is_windows = strcmp(dev.device_os, "Windows") == 0;
            GGML_ASSERT(!is_windows && "Windows is not tested yet\n");

-            int64_t b_cio = (bi + bo) * int(m == 0) + c_cpu[m];
+            int64_t b_cio = (bi / n_vocab + bo) * int(m == 0) + c_cpu[m];

            if (in_set(m, M1)) {
                vec_z[m] = (double)(dev.memory.available_physical * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
@ -1174,7 +1176,7 @@ static bool assign_device(
                float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
                vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
                if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
-                    vec_z_gpu[m] -= (double)(bi + bo) / (double)(n_layer * b_prime);
+                    vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
                }
                dev_gpu[m] = 1;
            } else {
@ -1269,7 +1271,7 @@ static bool assign_device(
            
            // constraint coefficients 3: RAM constraint for each device
            for (uint32_t m = 0; m < n_world; ++m) {
-                const device_info &dev = dev_info_set[m];
+                const device_info & dev = dev_info_set[m];
                GGML_ASSERT(dev.device_os != nullptr);
                bool is_macos = strcmp(dev.device_os, "macOS") == 0;
                int cons_row = constraint_idx + m;
@ -1373,10 +1375,11 @@ static bool assign_device(
        GGML_ASSERT(final_solution[m] == w[m] && final_solution[m + n_world] == n[m]);
        LOG_INF("\n%s:\n", device_name);
        LOG_INF("  - Device Index   : %d\n", m);
+        LOG_INF("  - Assignment Set : %s\n", in_set(m, M1) ? "M1" : in_set(m, M2) ? "M2" : in_set(m, M3) ? "M3" : "M4");
        LOG_INF("  - N Layer Window : %d\n", w[m]);
        LOG_INF("  - N GPU Layers   : %d\n", n[m]);
    }
-    LOG_INF("\nTotal Latency: %.3f ms\n", final_objective);
+    LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
    LOG_INF("------------------------------------------");

 #else
@ -1478,7 +1481,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
        if (my_rank == 0) {
            // automatically determine n_layer_window and n_gpu_layers
-            if (!assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
+            if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
                llama_free(lctx);
                llama_free_model(model);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5255,14 +5255,11 @@ struct llama_model_loader {
        const auto & mapping = mappings.at(idx);
        *addr = mapping->addr;

-        auto merge_tensor_range = [&](ggml_context * context, bool keep_only_inp_out) {
+        auto merge_tensor_range = [&](ggml_context * context, bool keep_output) {
            for (ggml_tensor * tensor = ggml_get_first_tensor(context); tensor; tensor = ggml_get_next_tensor(context, tensor)) {
                try {
                    const char * tname = ggml_get_name(tensor);
-                    if (keep_only_inp_out && !(
-                            // strcmp(tname, "token_embd.weight") == 0 || // lookup table is used so we do not need to keep it in metal memory
-                            strcmp(tname, "output_norm.weight") == 0 || 
-                            strcmp(tname, "output.weight") == 0)) {
+                    if (keep_output && !(strcmp(tname, "output_norm.weight") == 0 || strcmp(tname, "output.weight") == 0)) {
                        continue;
                    }