diff --git a/.gitignore b/.gitignore index 406bebaf..6f1e6062 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,7 @@ autogen-*.md /main /server +/profile-tool # CI @@ -135,4 +136,8 @@ poetry.toml /lora-tests # Video -*.mp4 \ No newline at end of file +*.mp4 + +# fio +fio_test* +*.fio \ No newline at end of file diff --git a/Makefile b/Makefile index a3f1bf04..bcbaf01c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = llama-cli +BUILD_TARGETS = llama-cli profile-tool # BUILD_TARGETS = \ # libllava.a \ # llama-baby-llama \ @@ -1528,6 +1528,11 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual +profile-tool: tools/profile_tool.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift (cd examples/batched.swift; make build) diff --git a/README.md b/README.md index 71558172..5e40cc2e 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ Here are the models we have tested so far. You can also try more on Hugging Face - **DeepSeek R1-8B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF) - **DeepSeek R1-14B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF) - **DeepSeek R1-32B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF) -- **DeepSeek R1-70B (Q4K, Q6K, Q80):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF) +- **DeepSeek R1-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF)):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF) ## ⚙️ How to Use? @@ -287,7 +287,15 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe > Example: if `-lw "16,16,16,16"` is passed to the head device, then each of the 4 devices will handle 16 model layers. A worker with `-ngl 8` (if a GPU is available) will run 8/16 layers on the GPU. -**2. How to run in chat mode like in llama.cpp?** +**2. How to manually profile my device?** + +If `-lw` is set, prima.cpp skips profiling and runs directly with the user-defined `-lw` and `-ngl`. If you wish to profile a device manually, run `profile-tool` on that device. + +```shell +./profile-tool -m download/qwq-32b-q4_k_m.gguf +``` + +**3. How to run in chat mode like in llama.cpp?** To enable chat (conversation) mode, simply add the `-cnv` flag on the head device: @@ -298,7 +306,7 @@ To enable chat (conversation) mode, simply add the `-cnv` flag on the head devic To quit the chat mode, input `quit` or `exit`. -**3. How to force prefetching after computing?** +**4. How to force prefetching after computing?** By default, prima.cpp only advises the OS to prefetch upcoming layer weights. The actual prefetching is then scheduled and handled by the OS, which may introduce some uncertainty. To explicitly trigger prefetching right after computing, you can use the `--force` flag on each device: @@ -309,11 +317,11 @@ By default, prima.cpp only advises the OS to prefetch upcoming layer weights. 
Th This enables more aggressive overlap but also introduce extra memory access latency. Use `--force` only after testing, as its effect depends on your hardware and OS behavior. -**4. Does it support Windows?** +**5. Does it support Windows?** Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS, Android and HarmonyOS (via Termux). You can mix heterogeneous devices in the cluster. -**5. Does it support Vulkan or AMD GPUs?** +**6. Does it support Vulkan or AMD GPUs?** Not yet. Now prima.cpp supports only CUDA-based GPUs. Vulkan is in our roadmap, and AMD GPUs will be supported once we have that device. diff --git a/common/common.cpp b/common/common.cpp index 704c7335..fd02664d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -901,13 +901,19 @@ static bool assign_layers_to_device( float t_read_ram_cpu = 0.0f; float t_calc_cpu = ( - master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_iq2xxs_f32/ (dev.cpu_props.flops_iq2xxs_f32* 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms + float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms @@ -921,24 +927,36 @@ static bool assign_layers_to_device( if (dev.gpu_support.metal) { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / 
(dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1m_f32 / (dev.gpu_props.metal_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms + t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms } else { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1m_f32 / (dev.gpu_props.cuda_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms + t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms } @@ -1113,14 +1131,18 @@ static bool assign_layers_to_device( if (m == 0) { kappa = ( - dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms - + dev.model_flops.layer_f32_f32 / 
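// [Editor's note: illustrative sketch, not part of the patch.] Every term in the
// t_calc_cpu / t_calc_gpu sums above, and in the kappa sum further below, applies the
// same conversion: per-layer FLOPs of one weight type divided by the profiled
// throughput for that type (in GFLOPS), turned into milliseconds. Weight types that do
// not occur in the model contribute zero FLOPs, so their terms vanish; EPS only guards
// the division when a throughput was not profiled. A hypothetical helper capturing the
// repeated expression (term_ms and eps are the editor's names, not from the patch):
static inline double term_ms(double n_flops, double gflops) {
    const double eps = 1e-9;                         // stand-in for the patch's EPS constant
    return n_flops / (gflops * 1e9 + eps) * 1000.0;  // FLOPs / (FLOP/s) -> seconds -> ms
}
// e.g. t_calc_cpu == term_ms(layer_f32_f32, flops_f32_f32) + term_ms(layer_f16_f32, flops_f16_f32) + ... over all supported weight types.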
(dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) + + dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms @@ -1505,6 +1527,12 @@ static bool assign_layers_to_device( // struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + +#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)) + // reset n_gpu_layers to 0 if GPU is not used + params.n_gpu_layers = 0; +#endif + llama_init_result iparams; auto mparams = llama_model_params_from_gpt_params(params); @@ -1554,19 +1582,13 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { uint32_t my_rank = params.rank; bool auto_schedule = params.n_layer_window[0] == 0; - // get device profile - LOG_INF("\nstart profiling this device, this may take some seconds ...\n"); - dev_info.rank = params.rank; - if (n_world > 1) { - llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn); - } - // create llama context struct llama_context_params cparams = llama_context_params_from_gpt_params(params); llama_context * lctx = llama_new_context_with_model(model, cparams); if (n_world == 1) { uint32_t n_layers = llama_model_n_layers(model); + // assign all layers to this device params.n_layer_window[0] = n_layers; cparams.n_layer_window[0] = n_layers; mparams.n_layer_window[0] = n_layers; @@ -1577,16 +1599,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { // initialize sockets llama_init_sockets(lctx, n_world, my_rank); + // broadcast startup args + struct startup_args args; + if (my_rank == 0){ + args.should_profile = auto_schedule; + } + llama_bcast_startup_args(lctx, my_rank, &args); + + auto_schedule = args.should_profile; + // if n_world > 1 and auto schedule is needed, then profile this device + if (auto_schedule){ + // get device profile + LOG_INF("\nstart profiling this device, this may take some seconds ...\n"); + dev_info.rank = params.rank; + if (n_world > 1) { + llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn); + } + } + // synchronize device profile to the master node - struct device_info * dev_info_set = nullptr; if (my_rank == 0) { - dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info)); - dev_info_set[0] = dev_info; - - llama_gather_device_info(lctx, dev_info_set); - device_print_props(dev_info_set, n_world, model, cparams); - if (auto_schedule) { + struct device_info * dev_info_set = nullptr; + dev_info_set = (struct device_info 
*)malloc(n_world * sizeof(struct device_info)); + dev_info_set[0] = dev_info; + + llama_gather_device_info(lctx, dev_info_set); + device_print_props(dev_info_set, n_world, model, cparams); + // automatically determine n_layer_window and n_gpu_layers if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) { LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__); @@ -1601,7 +1641,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_bcast_layer_setup(lctx, n_layer_window, nullptr); } } else { - llama_send_device_info(lctx, &dev_info); + if (auto_schedule){ + llama_send_device_info(lctx, &dev_info); + } llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers); } @@ -1766,33 +1808,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & return mparams; } -static ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return GGML_TYPE_F32; - } - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, // Added BF16 data type support + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; - throw std::runtime_error("Invalid cache type: " + s); +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); } struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { diff --git a/common/profiler.cpp b/common/profiler.cpp index 69a20af0..461cc3b9 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -188,6 +188,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum }; struct ggml_context * ctx = ggml_init(params); + if(n_embd < ggml_blck_size(src0t)){ + n_embd = 2 * ggml_blck_size(src0t); + } struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd); struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd); @@ -208,10 +211,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ctx_cgraph = ggml_init(params0); gf = ggml_new_graph(ctx_cgraph); + cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b); for (int i = 0; i < n_repeat - 1; i++) { cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur); } + ggml_build_forward_expand(gf, cur); } @@ -345,7 +350,6 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in return 0.0f; } - size_t QK_K = 0; switch (src0t) { case GGML_TYPE_F32: { matrix_B = malloc(embd_size * sizeof(float)); @@ -364,14 +368,18 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in ggml_fp32_to_fp16_row(temp_f32.data(), static_cast(matrix_B), embd_size); break; } + case GGML_TYPE_Q2_K: case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_Q5_0: case GGML_TYPE_Q8_0: - QK_K = ggml_blck_size(src0t); - matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t)); + 
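// [Editor's note: illustrative sketch, not part of the patch.] Two changes in this hunk
// share the same rationale: quantized types store weights in fixed-size blocks (for
// example 256 elements per block for the k-quants), so (1) device_flops() above now bumps
// n_embd up to 2 * ggml_blck_size(src0t) when it is smaller than one block, since a
// quantized benchmark tensor narrower than one block could not be created, and (2) the
// embedding buffer is sized as elements / block_size * bytes_per_block instead of relying
// on a single QK_K constant. A hedged sketch of that sizing (quantized_buf_bytes is the
// editor's name, not from the patch):
#include "ggml.h"
static size_t quantized_buf_bytes(enum ggml_type t, size_t n_elements) {
    // ggml_blck_size(): elements per quantization block; ggml_type_size(): bytes per block
    return n_elements / (size_t) ggml_blck_size(t) * ggml_type_size(t);
}
// (ggml also exposes ggml_row_size(type, ne) for the per-row form of this computation;
// mentioned only as an aside, the patch does not depend on it.)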
case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ1_M: + matrix_B = malloc((embd_size / ggml_blck_size(src0t) * ggml_type_size(src0t))); // The quantization block sizes are inconsistent for different quantization methods break; default: LOG_INF("Unsupported type: %d\n", src0t); @@ -1347,33 +1355,47 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c #ifdef GGML_USE_CUDA struct gpu_props gpu = dev_info.gpu_props; - gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.cuda_flops_q2k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.cuda_flops_iq2xxs_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.cuda_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.cuda_flops_iq4nl_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)gpu.cuda_flops_iq1m_f32 + EPS) / 1e9; #elif GGML_USE_METAL struct gpu_props gpu = dev_info.gpu_props; - gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / 
((double)gpu.metal_flops_q2k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.metal_flops_iq2xxs_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.metal_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.metal_flops_iq4nl_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)gpu.metal_flops_iq1m_f32 + EPS) / 1e9; #endif - cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; - + cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)cpu.flops_iq1m_f32 + EPS) / 1e9; double total_latency = 0.0f; #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) @@ -1385,13 +1407,18 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c total_latency += cpu_latency_per_layer * n_layers; #endif - total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 
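// [Editor's note, not part of the patch.] In device_compute_delay() the per-layer terms
// above are charged to the GPU or CPU figures depending on the build flags, while the
// output (lm_head) terms below always divide by cpu.flops_*: the output matmul is charged
// to the CPU in this cost model. The accumulated seconds are multiplied by 1000 at the end
// so the function reports milliseconds.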
1e9; - total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq1m_f32 / ((double)cpu.flops_iq1m_f32 + EPS) / 1e9; total_latency *= 1000; // convert to ms @@ -1647,474 +1674,664 @@ static float device_mem_copy_delay(struct device_info & dev_info, struct llama_m void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) { LOG_INF("\n-------------------------------------------------------------------------------------------\n"); - LOG_INF("| Property "); + LOG_INF("| Property "); for (int i = 0; i < n; ++i) { LOG_INF("| Rank %-8d", i); GGML_ASSERT((int)dev_info_set[i].rank == i); } LOG_INF("\n-------------------------------------------------------------------------------------------\n"); - LOG_INF("| Device Name "); + LOG_INF("| Device Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].device_name); } LOG_INF("\n"); - LOG_INF("| Device OS "); + LOG_INF("| Device OS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].device_os); } LOG_INF("\n"); - LOG_INF("| CPU Name "); + LOG_INF("| CPU Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name); } LOG_INF("\n"); - LOG_INF("| CPU Description "); + LOG_INF("| CPU Description "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description); } LOG_INF("\n"); - LOG_INF("| Number of CPU cores "); + LOG_INF("| Number of CPU cores "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores); } LOG_INF("\n"); - LOG_INF("| CPU flops (F32xF32, GFLOPS) "); + LOG_INF("| CPU flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (F16xF32, GFLOPS) "); + LOG_INF("| CPU flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q4K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q2K x F32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (Q4K x F32, GFLOPS) "); for 
(int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q50 x F32, GFLOPS)"); - for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); - } - LOG_INF("\n"); - - LOG_INF("| CPU flops (Q5K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q5K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q6K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q6K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q80 x F32, GFLOPS)"); + LOG_INF("| CPU flops (IQ2XXS x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq2xxs_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (Q50 x F32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (Q80 x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| Physical Mem Total (GiB) "); + LOG_INF("| CPU flops (IQ1S x F32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (IQ1M x F32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1m_f32); + } + LOG_INF("\n"); + + LOG_INF("| Physical Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); } LOG_INF("\n"); - LOG_INF("| Physical Mem Available (GiB) "); + LOG_INF("| Physical Mem Available (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical); } LOG_INF("\n"); - LOG_INF("| Used Mem Swappable (GiB) "); + LOG_INF("| Used Mem Swappable (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.used_can_swap); } LOG_INF("\n"); - LOG_INF("| Swap Mem Total (GiB) "); + LOG_INF("| Swap Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap); } LOG_INF("\n"); - LOG_INF("| Swap Mem Available (GiB) "); + LOG_INF("| Swap Mem Available (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap); } LOG_INF("\n"); - LOG_INF("| CPU RAM Read BW (GB/s) "); + LOG_INF("| CPU RAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw); } LOG_INF("\n"); - LOG_INF("| CPU KVCache Copy Time (ms/l) "); + LOG_INF("| CPU KVCache Copy Time (ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| Disk Read Seq Speed (GB/s) "); + LOG_INF("| Disk Read Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw); } LOG_INF("\n"); - LOG_INF("| Disk Write Seq Speed (GB/s) "); + LOG_INF("| Disk Write Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw); } LOG_INF("\n"); - LOG_INF("| Disk Read Rnd Speed (GB/s) "); + LOG_INF("| Disk Read Rnd 
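// [Editor's note, not part of the patch.] The row labels in device_print_props() were
// re-padded to a single common width so the "| Rank N" columns keep lining up now that
// longer labels such as "CPU flops (IQ2XXS x F32, GFLOPS)" exist; any label added later
// should be padded to that same width.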
Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw); } LOG_INF("\n"); - LOG_INF("| Disk Write Rnd Speed (GB/s) "); + LOG_INF("| Disk Write Rnd Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw); } LOG_INF("\n"); - LOG_INF("| GPU Metal "); + LOG_INF("| GPU Metal "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal); } LOG_INF("\n"); - LOG_INF("| GPU CUDA "); + LOG_INF("| GPU CUDA "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda); } LOG_INF("\n"); - LOG_INF("| GPU Vulkan "); + LOG_INF("| GPU Vulkan "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan); } LOG_INF("\n"); - LOG_INF("| GPU Kompute "); + LOG_INF("| GPU Kompute "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute); } LOG_INF("\n"); - LOG_INF("| GPU BLAS "); + LOG_INF("| GPU BLAS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas); } LOG_INF("\n"); - LOG_INF("| BLAS "); + LOG_INF("| BLAS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas); } LOG_INF("\n"); - LOG_INF("| SYCL "); + LOG_INF("| SYCL "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl); } LOG_INF("\n"); - LOG_INF("| GPU Name "); + LOG_INF("| GPU Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name); } LOG_INF("\n"); - LOG_INF("| GPU Description "); + LOG_INF("| GPU Description "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description); } LOG_INF("\n"); - LOG_INF("| GPU Mem Free (GiB) "); + LOG_INF("| GPU Mem Free (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free); } LOG_INF("\n"); - LOG_INF("| GPU Mem Total (GiB) "); + LOG_INF("| GPU Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total); } LOG_INF("\n"); - LOG_INF("| Metal VRAM Read BW (GB/s) "); + LOG_INF("| Metal VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw); } LOG_INF("\n"); - LOG_INF("| Metal KVCache Copy Time(ms/l)"); + LOG_INF("| Metal KVCache Copy Time(ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| Metal flops (F32xF32, GFLOPS)"); + LOG_INF("| Metal flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (F16xF32, GFLOPS)"); + LOG_INF("| Metal flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q2KxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q50xF32, GFLOPS)"); - for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); - } - LOG_INF("\n"); - - LOG_INF("| Metal flops (Q5KxF32, GFLOPS)"); + LOG_INF("| Metal flops 
(Q5KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q6KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q80xF32, GFLOPS)"); + LOG_INF("| Metal flops (IQ2XXSxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq2xxs_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (Q50xF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| CUDA VRAM Read BW (GB/s) "); + LOG_INF("| Metal flops (IQ1SxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (IQ1MxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1m_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); } LOG_INF("\n"); - LOG_INF("| CUDA KVCache Copy Time (ms/l)"); + LOG_INF("| CUDA KVCache Copy Time (ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); + LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); + LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); - for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); - } - LOG_INF("\n"); - - LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); + LOG_INF("| CUDA flops (IQ2XXSxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq2xxs_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| 
%-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| Model flops (output F32xF32) "); + LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (IQ1MxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1m_f32); + } + LOG_INF("\n"); + + LOG_INF("| Model flops (output F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output F16xF32) "); + LOG_INF("| Model flops (output F16xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q4KxF32) "); + LOG_INF("| Model flops (output Q2KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output Q4KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q50xF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); - LOG_INF("\n"); - - LOG_INF("| Model flops (output Q5KxF32) "); + LOG_INF("| Model flops (output Q5KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q6KxF32) "); + LOG_INF("| Model flops (output Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q80xF32) "); + LOG_INF("| Model flops (output IQ2XXSxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq2xxs_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output Q50xF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer F32xF32) "); + LOG_INF("| Model flops (output IQ1SxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output IQ4NLxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output IQ1MxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1m_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer F16xF32) "); + LOG_INF("| Model flops (layer F16xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q4KxF32) "); + LOG_INF("| Model flops (layer Q2KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer Q4KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); LOG_INF("\n"); - LOG_INF("| Model flops 
(layer Q50xF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); - LOG_INF("\n"); - - LOG_INF("| Model flops (layer Q5KxF32) "); + LOG_INF("| Model flops (layer Q5KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q6KxF32) "); + LOG_INF("| Model flops (layer Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q80xF32) "); + LOG_INF("| Model flops (layer IQ2XXSxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq2xxs_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer Q50xF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32); LOG_INF("\n"); - LOG_INF("| Model params (input F32) "); + LOG_INF("| Model flops (layer IQ1SxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer IQ4NLxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer IQ1MxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1m_f32); + LOG_INF("\n"); + + LOG_INF("| Model params (input F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32); LOG_INF("\n"); - LOG_INF("| Model params (input F16) "); + LOG_INF("| Model params (input F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); LOG_INF("\n"); - LOG_INF("| Model params (input Q4K) "); + LOG_INF("| Model params (input Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); LOG_INF("\n"); - LOG_INF("| Model params (input Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); - LOG_INF("\n"); - - LOG_INF("| Model params (input Q5K) "); + LOG_INF("| Model params (input Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k); LOG_INF("\n"); - LOG_INF("| Model params (input Q6K) "); + LOG_INF("| Model params (input Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); LOG_INF("\n"); - LOG_INF("| Model params (input Q80) "); + LOG_INF("| Model params (input IQ2XXS) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq2xxs); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); LOG_INF("\n"); - LOG_INF("| Model params (layer F32) "); + LOG_INF("| Model params (input IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (input IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl); + LOG_INF("\n"); + + LOG_INF("| Model params (input IQ1M) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1m); + LOG_INF("\n"); + + LOG_INF("| Model params (layer F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); LOG_INF("\n"); - LOG_INF("| Model params (layer F16) "); + LOG_INF("| Model params 
(layer F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); LOG_INF("\n"); - LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| Model params (layer Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); - LOG_INF("\n"); - - LOG_INF("| Model params (layer Q5K) "); + LOG_INF("| Model params (layer Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q6K) "); + LOG_INF("| Model params (layer Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q80) "); + LOG_INF("| Model params (layer IQ2XXS) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq2xxs); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); LOG_INF("\n"); - LOG_INF("| Model params (output F32) "); + LOG_INF("| Model params (layer IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (layer IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl); + LOG_INF("\n"); + + LOG_INF("| Model params (layer IQ1M) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1m); + LOG_INF("\n"); + + LOG_INF("| Model params (output F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); LOG_INF("\n"); - LOG_INF("| Model params (output F16) "); + LOG_INF("| Model params (output F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); LOG_INF("\n"); - LOG_INF("| Model params (output Q4K) "); + LOG_INF("| Model params (output Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); LOG_INF("\n"); - LOG_INF("| Model params (output Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); - LOG_INF("\n"); - - LOG_INF("| Model params (output Q5K) "); + LOG_INF("| Model params (output Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k); LOG_INF("\n"); - LOG_INF("| Model params (output Q6K) "); + LOG_INF("| Model params (output Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); LOG_INF("\n"); - LOG_INF("| Model params (output Q80) "); + LOG_INF("| Model params (output IQ2XXS) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq2xxs); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); - LOG_INF("| Model bytes (input) "); + LOG_INF("| Model params (output IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (output IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", 
dev_info_set[0].model_params.output_iq4nl); + LOG_INF("\n"); + + LOG_INF("| Model params (output IQ1M) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1m); + LOG_INF("\n"); + + LOG_INF("| Model bytes (input) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input); LOG_INF("\n"); - LOG_INF("| Model bytes (layer) "); + LOG_INF("| Model bytes (layer) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer); LOG_INF("\n"); - LOG_INF("| Model bytes (output) "); + LOG_INF("| Model bytes (output) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output); LOG_INF("\n"); @@ -2155,17 +2372,44 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(struct disk_props) + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 7 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + + sizeof(float) * 12 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, + // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32 + // - cpu_props.flops_iq2xxs_f32 + // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32 + // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32 + // - cpu_props.flops_iq1m_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 20; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, - // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, - // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32, - // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay + + sizeof(float) * 30; // GPU attributes + // memory: + // - memory_free, memory_total + // - metal_read_vram_bw, cuda_read_vram_bw + // Metal floating-point performance: + // - metal_flops_f32_f32, metal_flops_f16_f32 + // - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32 + // - metal_flops_iq2xxs_f32 + // - metal_flops_q50_f32, metal_flops_q80_f32 + // - metal_flops_iq1s_f32, metal_flops_iq4nl_f32 + // - metal_flops_iq1m_f32 + // CUDA floating-point performance: + // - cuda_flops_f32_f32, cuda_flops_f16_f32 + // - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32 + // - cuda_flops_iq2xxs_f32 + // - cuda_flops_q50_f32, cuda_flops_q80_f32 + // - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32 + // - cuda_flops_iq1m_f32 + // delay: + // - metal_mem_cpy_delay, cuda_mem_cpy_delay *buffer = (char *)malloc(total_size); char * ptr = *buffer; + if (*buffer == NULL) { + LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n", + __func__, total_size); + return 0; + } + // rank memcpy(ptr, &dev_info->rank, sizeof(uint32_t)); ptr += sizeof(uint32_t); @@ -2214,10 +2458,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - 
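// [Editor's note: illustrative sketch, not part of the patch.] serialize() and
// deserialize() must copy exactly the same fields in exactly the same order, and the
// float counts in total_size (12 CPU flops values, 30 GPU values) must match those
// sequences; this hunk inserts q2k before q4k and moves q50/q80 after iq2xxs on both
// sides for that reason. A hypothetical pair of helpers that keeps sender and receiver
// in lockstep (put_f32/get_f32 are the editor's names, not part of the patch):
#include <cstring>
static inline void put_f32(char ** p, float v)         { std::memcpy(*p, &v, sizeof(float)); *p += sizeof(float); }
static inline void get_f32(const char ** p, float * v) { std::memcpy(v, *p, sizeof(float)); *p += sizeof(float); }
// e.g. put_f32(&ptr, dev_info->cpu_props.flops_q2k_f32) on the sending side must be
// matched by get_f32(&ptr, &dev_info->cpu_props.flops_q2k_f32) at the same offset on the
// receiving side.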
memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float)); @@ -2226,9 +2470,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_iq2xxs_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.flops_iq1m_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2250,10 +2509,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float)); @@ -2262,9 +2521,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq2xxs_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1m_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float)); ptr += sizeof(float); @@ -2277,10 +2551,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float)); @@ -2289,9 +2563,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq2xxs_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float)); ptr += 
sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1m_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float)); // no need to synchronize model flops and model params @@ -2366,10 +2655,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float)); @@ -2378,9 +2667,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_iq2xxs_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.flops_iq1m_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2402,10 +2706,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float)); @@ -2414,9 +2718,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_iq2xxs_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_iq1m_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float)); ptr += sizeof(float); @@ -2429,10 +2748,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); + 
memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float)); @@ -2441,9 +2760,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_iq2xxs_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_iq1m_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float)); // no need to synchronize model flops and model params diff --git a/common/profiler.h b/common/profiler.h index fb9a4ddb..a685ff8c 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -15,25 +15,36 @@ struct cpu_props { const char * name; const char * description; uint32_t cores; - float flops_f32_f32; // in GFLOPS - float flops_f16_f32; // in GFLOPS - float flops_q4k_f32; // in GFLOPS - float flops_q50_f32; // in GFLOPS - float flops_q5k_f32; // in GFLOPS - float flops_q6k_f32; // in GFLOPS - float flops_q80_f32; // in GFLOPS + float flops_f32_f32; // in GFLOPS + float flops_f16_f32; // in GFLOPS + float flops_q2k_f32; // in GFLOPS + float flops_q4k_f32; // in GFLOPS + float flops_q5k_f32; // in GFLOPS + float flops_q6k_f32; // in GFLOPS + float flops_iq2xxs_f32; // in GFLOPS + float flops_q50_f32; // in GFLOPS + float flops_q80_f32; // in GFLOPS + float flops_iq1s_f32; // in GFLOPS + float flops_iq4nl_f32; // in GFLOPS + float flops_iq1m_f32; // in GFLOPS - cpu_props() : - name(""), - description(""), - cores(0), - flops_f32_f32(0.0f), - flops_f16_f32(0.0f), - flops_q4k_f32(0.0f), - flops_q50_f32(0.0f), - flops_q5k_f32(0.0f), - flops_q6k_f32(0.0f), - flops_q80_f32(0.0f) {} + cpu_props() + : name (""), + description (""), + cores (0), + flops_f32_f32 (0.0f), + flops_f16_f32 (0.0f), + flops_q2k_f32 (0.0f), + flops_q4k_f32 (0.0f), + flops_q5k_f32 (0.0f), + flops_q6k_f32 (0.0f), + flops_iq2xxs_f32(0.0f), + flops_q50_f32 (0.0f), + flops_q80_f32 (0.0f), + flops_iq1s_f32 (0.0f), + flops_iq4nl_f32 (0.0f), + flops_iq1m_f32 (0.0f) + {} }; struct memory_info { @@ -77,132 +88,204 @@ struct gpu_support { struct gpu_props { const char * name; const char * description; - float memory_free; // in GiB - float memory_total; // in GiB - float metal_read_vram_bw; // in GB/s - float metal_flops_f32_f32; // in GFLOPS - float metal_flops_f16_f32; // in GFLOPS - float metal_flops_q4k_f32; // in GFLOPS - float metal_flops_q50_f32; // in GFLOPS - float metal_flops_q5k_f32; // in GFLOPS - float metal_flops_q6k_f32; // in GFLOPS - float metal_flops_q80_f32; // in GFLOPS - float metal_mem_cpy_delay; // in ms - float cuda_read_vram_bw; // in GB/s - float cuda_flops_f32_f32; // in GFLOPS - float cuda_flops_f16_f32; // in GFLOPS - float cuda_flops_q4k_f32; // in GFLOPS - float cuda_flops_q50_f32; // in GFLOPS - float cuda_flops_q5k_f32; // in GFLOPS - float 
cuda_flops_q6k_f32; // in GFLOPS - float cuda_flops_q80_f32; // in GFLOPS - float cuda_mem_cpy_delay; // in ms + float memory_free; // in GiB + float memory_total; // in GiB + float metal_read_vram_bw; // in GB/s + float metal_flops_f32_f32; // in GFLOPS + float metal_flops_f16_f32; // in GFLOPS + float metal_flops_q2k_f32; // in GFLOPS + float metal_flops_q4k_f32; // in GFLOPS + float metal_flops_q5k_f32; // in GFLOPS + float metal_flops_q6k_f32; // in GFLOPS + float metal_flops_iq2xxs_f32; // in GFLOPS + float metal_flops_q50_f32; // in GFLOPS + float metal_flops_q80_f32; // in GFLOPS + float metal_flops_iq1s_f32; // in GFLOPS + float metal_flops_iq4nl_f32; // in GFLOPS + float metal_flops_iq1m_f32; // in GFLOPS + float metal_mem_cpy_delay; // in ms + float cuda_read_vram_bw; // in GB/s + float cuda_flops_f32_f32; // in GFLOPS + float cuda_flops_f16_f32; // in GFLOPS + float cuda_flops_q2k_f32; // in GFLOPS + float cuda_flops_q4k_f32; // in GFLOPS + float cuda_flops_q5k_f32; // in GFLOPS + float cuda_flops_q6k_f32; // in GFLOPS + float cuda_flops_iq2xxs_f32; // in GFLOPS + float cuda_flops_q50_f32; // in GFLOPS + float cuda_flops_q80_f32; // in GFLOPS + float cuda_flops_iq1s_f32; // in GFLOPS + float cuda_flops_iq4nl_f32; // in GFLOPS + float cuda_flops_iq1m_f32; // in GFLOPS + float cuda_mem_cpy_delay; // in ms gpu_props() : - name(""), - description(""), - memory_free (0.0f), - memory_total (0.0f), - metal_read_vram_bw (0.0f), - metal_flops_f32_f32(0.0f), - metal_flops_f16_f32(0.0f), - metal_flops_q4k_f32(0.0f), - metal_flops_q50_f32(0.0f), - metal_flops_q5k_f32(0.0f), - metal_flops_q6k_f32(0.0f), - metal_flops_q80_f32(0.0f), - metal_mem_cpy_delay(0.0f), - cuda_read_vram_bw (0.0f), - cuda_flops_f32_f32 (0.0f), - cuda_flops_f16_f32 (0.0f), - cuda_flops_q4k_f32 (0.0f), - cuda_flops_q50_f32 (0.0f), - cuda_flops_q5k_f32 (0.0f), - cuda_flops_q6k_f32 (0.0f), - cuda_flops_q80_f32 (0.0f), - cuda_mem_cpy_delay (0.0f) {} + name (""), + description (""), + memory_free (0.0f), + memory_total (0.0f), + metal_read_vram_bw (0.0f), + metal_flops_f32_f32 (0.0f), + metal_flops_f16_f32 (0.0f), + metal_flops_q2k_f32 (0.0f), + metal_flops_q4k_f32 (0.0f), + metal_flops_q5k_f32 (0.0f), + metal_flops_q6k_f32 (0.0f), + metal_flops_iq2xxs_f32 (0.0f), + metal_flops_q50_f32 (0.0f), + metal_flops_q80_f32 (0.0f), + metal_flops_iq1s_f32 (0.0f), + metal_flops_iq4nl_f32 (0.0f), + metal_flops_iq1m_f32 (0.0f), + metal_mem_cpy_delay (0.0f), + cuda_read_vram_bw (0.0f), + cuda_flops_f32_f32 (0.0f), + cuda_flops_f16_f32 (0.0f), + cuda_flops_q2k_f32 (0.0f), + cuda_flops_q4k_f32 (0.0f), + cuda_flops_q5k_f32 (0.0f), + cuda_flops_q6k_f32 (0.0f), + cuda_flops_iq2xxs_f32 (0.0f), + cuda_flops_q50_f32 (0.0f), + cuda_flops_q80_f32 (0.0f), + cuda_flops_iq1s_f32 (0.0f), + cuda_flops_iq4nl_f32 (0.0f), + cuda_flops_iq1m_f32 (0.0f), + cuda_mem_cpy_delay (0.0f) {} }; struct model_flops { float inp_embd_ms; int64_t output_f32_f32; int64_t output_f16_f32; + int64_t output_q2k_f32; int64_t output_q4k_f32; - int64_t output_q50_f32; int64_t output_q5k_f32; int64_t output_q6k_f32; + int64_t output_iq2xxs_f32; + int64_t output_q50_f32; int64_t output_q80_f32; + int64_t output_iq1s_f32; + int64_t output_iq4nl_f32; + int64_t output_iq1m_f32; int64_t layer_f32_f32; int64_t layer_f16_f32; + int64_t layer_q2k_f32; int64_t layer_q4k_f32; - int64_t layer_q50_f32; int64_t layer_q5k_f32; int64_t layer_q6k_f32; + int64_t layer_iq2xxs_f32; + int64_t layer_q50_f32; int64_t layer_q80_f32; + int64_t layer_iq1s_f32; + int64_t layer_iq4nl_f32; + int64_t 
layer_iq1m_f32; model_flops() : - inp_embd_ms(0.0f), - output_f32_f32(0), - output_f16_f32(0), - output_q4k_f32(0), - output_q50_f32(0), - output_q5k_f32(0), - output_q6k_f32(0), - output_q80_f32(0), - layer_f32_f32 (0), - layer_f16_f32 (0), - layer_q4k_f32 (0), - layer_q50_f32 (0), - layer_q5k_f32 (0), - layer_q6k_f32 (0), - layer_q80_f32 (0) {} + inp_embd_ms (0.0f), + output_f32_f32 (0), + output_f16_f32 (0), + output_q2k_f32 (0), + output_q4k_f32 (0), + output_q5k_f32 (0), + output_q6k_f32 (0), + output_iq2xxs_f32 (0), + output_q50_f32 (0), + output_q80_f32 (0), + output_iq1s_f32 (0), + output_iq4nl_f32 (0), + output_iq1m_f32 (0), + layer_f32_f32 (0), + layer_f16_f32 (0), + layer_q2k_f32 (0), + layer_q4k_f32 (0), + layer_q5k_f32 (0), + layer_q6k_f32 (0), + layer_iq2xxs_f32 (0), + layer_q50_f32 (0), + layer_q80_f32 (0), + layer_iq1s_f32 (0), + layer_iq4nl_f32 (0), + layer_iq1m_f32 (0) + {} }; struct model_params { int64_t input_f32; int64_t input_f16; + int64_t input_q2k; int64_t input_q4k; - int64_t input_q50; int64_t input_q5k; int64_t input_q6k; + int64_t input_iq2xxs; + int64_t input_q50; int64_t input_q80; + int64_t input_iq1s; + int64_t input_iq4nl; + int64_t input_iq1m; int64_t output_f32; int64_t output_f16; + int64_t output_q2k; int64_t output_q4k; - int64_t output_q50; int64_t output_q5k; int64_t output_q6k; + int64_t output_iq2xxs; + int64_t output_q50; int64_t output_q80; + int64_t output_iq1s; + int64_t output_iq4nl; + int64_t output_iq1m; int64_t layer_f32; int64_t layer_f16; + int64_t layer_q2k; int64_t layer_q4k; - int64_t layer_q50; int64_t layer_q5k; int64_t layer_q6k; + int64_t layer_iq2xxs; + int64_t layer_q50; int64_t layer_q80; + int64_t layer_iq1s; + int64_t layer_iq4nl; + int64_t layer_iq1m; model_params() : - input_f32 (0), - input_f16 (0), - input_q4k (0), - input_q50 (0), - input_q5k (0), - input_q6k (0), - input_q80 (0), - output_f32(0), - output_f16(0), - output_q4k(0), - output_q50(0), - output_q5k(0), - output_q6k(0), - output_q80(0), - layer_f32 (0), - layer_f16 (0), - layer_q4k (0), - layer_q50 (0), - layer_q5k (0), - layer_q6k (0), - layer_q80 (0) {} + input_f32 (0), + input_f16 (0), + input_q2k (0), + input_q4k (0), + input_q5k (0), + input_q6k (0), + input_iq2xxs (0), + input_q50 (0), + input_q80 (0), + input_iq1s (0), + input_iq4nl (0), + input_iq1m (0), + output_f32 (0), + output_f16 (0), + output_q2k (0), + output_q4k (0), + output_q5k (0), + output_q6k (0), + output_iq2xxs (0), + output_q50 (0), + output_q80 (0), + output_iq1s (0), + output_iq4nl (0), + output_iq1m (0), + layer_f32 (0), + layer_f16 (0), + layer_q2k (0), + layer_q4k (0), + layer_q5k (0), + layer_q6k (0), + layer_iq2xxs (0), + layer_q50 (0), + layer_q80 (0), + layer_iq1s (0), + layer_iq4nl (0), + layer_iq1m (0) + {} }; struct model_bytes { @@ -229,6 +312,10 @@ struct disk_props { write_rnd_bw(0.0f) {} }; +struct startup_args{ + bool should_profile; +}; + struct device_info { uint32_t rank; const char * device_name; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 962dc032..4af68abc 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -385,12 +385,12 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, - GGML_TYPE_Q4_0_4_4 = 31, - GGML_TYPE_Q4_0_4_8 = 32, - GGML_TYPE_Q4_0_8_8 = 33, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, - GGML_TYPE_COUNT, + GGML_TYPE_COUNT = 39, }; // precision @@ -431,9 +431,6 @@ extern 
"C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7aa6dce8..1c57cb95 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); - } break; - case GGML_TYPE_Q4_0_8_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); - } break; case GGML_TYPE_I8: case GGML_TYPE_I16: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 73426a5d..ffae7f2e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, }, - [GGML_TYPE_Q4_0_4_4] = { - .type_name = "q4_0_4x4", - .blck_size = QK4_0, - .blck_size_interleave = 4, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float = NULL, - .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x4_q8_0, - .gemm = ggml_gemm_q4_0_4x4_q8_0, - }, - [GGML_TYPE_Q4_0_4_8] = { - .type_name = "q4_0_4x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float = NULL, - .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x8_q8_0, - .gemm = ggml_gemm_q4_0_4x8_q8_0, - }, - [GGML_TYPE_Q4_0_8_8] = { - .type_name = "q4_0_8x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float = NULL, - .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 8, - .gemv = ggml_gemv_q4_0_8x8_q8_0, - .gemm = ggml_gemm_q4_0_8x8_q8_0, - }, [GGML_TYPE_TQ1_0] = { .type_name = "tq1_0", .blck_size = QK_K, @@ -3472,7 +3424,7 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) { double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } - + const char * ggml_type_name(enum ggml_type type) { return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE"; } @@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; - case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -4107,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - ///*.padding =*/ { 0 }, + // /*.padding =*/ { 0 }, }; #ifdef __clang__ @@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 5c14d2a3..fd4fec40 100644 --- a/include/llama.h +++ b/include/llama.h @@ -165,18 +165,18 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors, 1 bit quantization LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // 
except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors, 1 bit quantization LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack + // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack + // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors @@ -453,6 +453,7 @@ extern "C" { LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg); LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set); LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info); + LLAMA_API int llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args); LLAMA_API int llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers); LLAMA_API int llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers); diff --git a/src/llama.cpp b/src/llama.cpp index 88e13e5a..718327e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3560,16 +3560,26 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) case GGML_TYPE_F32: case GGML_TYPE_F16: return true; + case GGML_TYPE_Q2_K: + return n_params->layer_q2k > 0 || n_params->output_q2k > 0; case GGML_TYPE_Q4_K: - return n_params->layer_q4k > 0 || n_params->output_q4k > 0; - case GGML_TYPE_Q5_0: - return n_params->layer_q50 > 0 || n_params->output_q50 > 0; + return n_params->layer_q4k > 0 || n_params->output_q4k > 0; case GGML_TYPE_Q5_K: - return n_params->layer_q5k > 0 || n_params->output_q5k > 0; + return n_params->layer_q5k > 0 || n_params->output_q5k > 0; case GGML_TYPE_Q6_K: - return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + case GGML_TYPE_IQ2_XXS: + return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0; + case GGML_TYPE_Q5_0: + return n_params->layer_q50 > 0 || n_params->output_q50 > 0; case GGML_TYPE_Q8_0: - return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + case GGML_TYPE_IQ1_S: + return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; + case GGML_TYPE_IQ4_NL: + return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0; + case GGML_TYPE_IQ1_M: + return n_params->layer_iq1m > 0 || n_params->output_iq1m > 0; default: throw std::runtime_error("Unrecognized data type\n"); } @@ -3650,18 +3660,18 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) { + dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, 
GGML_TYPE_F32); + } + if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) { dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); } - if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { - dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - } - if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) { dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32); @@ -3674,11 +3684,42 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) { + dev_info->cpu_props.flops_iq2xxs_f32 = device_cpu_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq2xxs_f32= device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq2xxs_f32 = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { + dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) { dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) { + dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) { + dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) { + dev_info->cpu_props.flops_iq1m_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1m_f32= device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); + } } ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) { @@ -4844,9 +4885,7 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = 
LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; - case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; - case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; + default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -5654,9 +5693,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; default: return "unknown, may not work"; } @@ -18997,10 +19033,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || - new_type == GGML_TYPE_Q4_0_8_8) { - new_type = GGML_TYPE_Q4_0; - } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -19323,10 +19355,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; - + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -19646,14 +19675,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - int chunk_size_multiplier = 1; - if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { - if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; - else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; - if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; - } - LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -19666,8 +19687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int64_t nrows = tensor->ne[1]; static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? 
n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
             const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
             const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -20242,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
+int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
+    int32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    if (rank == 0){
+        // send
+        try {
+            std::vector<zmq::message_t> send_msgs;
+            send_msgs.emplace_back("should_profile", strlen("should_profile"));
+            send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+            zmq::send_multipart(*ctx->send_socket, send_msgs);
+        } catch (const zmq::error_t& e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    }else {
+        // receive
+        std::vector<zmq::message_t> recv_msgs;
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+            return -1;
+        }
+        GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
+        GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
+        bool should_profile = *static_cast<const bool *>(recv_msgs[1].data());
+        args->should_profile = should_profile;
+        if ((int)rank != (int)n_world - 1){
+            // send
+            try {
+                zmq::send_multipart(*ctx->send_socket, recv_msgs);
+            } catch (const zmq::error_t& e) {
+                LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
 int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
@@ -21049,25 +21109,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
         case PROFILER_LAYER_OUTPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_flops->output_f32_f32 += n;
+                    n_flops->output_f32_f32    += n;
                     break;
                 case GGML_TYPE_F16:
-                    n_flops->output_f16_f32 += n;
+                    n_flops->output_f16_f32    += n;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->output_q2k_f32    += n;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_flops->output_q4k_f32 += n;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->output_q50_f32 += n;
+                    n_flops->output_q4k_f32    += n;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_flops->output_q5k_f32 += n;
+                    n_flops->output_q5k_f32    += n;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_flops->output_q6k_f32 += n;
+                    n_flops->output_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->output_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->output_q50_f32    += n;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_flops->output_q80_f32 += n;
+                    n_flops->output_q80_f32    += n;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->output_iq1s_f32   += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->output_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->output_iq1m_f32   += n;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21075,27 +21150,42 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
             break;
 
         case PROFILER_LAYER_BACKEND:
-            switch (dtype) {
+            switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_flops->layer_f32_f32 += n;
+                    n_flops->layer_f32_f32    += n;
                     break;
                 case GGML_TYPE_F16:
-                    n_flops->layer_f16_f32 += n;
+                    n_flops->layer_f16_f32    += n;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->layer_q2k_f32    += n;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_flops->layer_q4k_f32 += n;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->layer_q50_f32 += n;
+                    n_flops->layer_q4k_f32    += n;
                     break;
                 case GGML_TYPE_Q5_K:
-
n_flops->layer_q5k_f32 += n; + n_flops->layer_q5k_f32 += n; break; case GGML_TYPE_Q6_K: - n_flops->layer_q6k_f32 += n; + n_flops->layer_q6k_f32 += n; + break; + case GGML_TYPE_IQ2_XXS: + n_flops->layer_iq2xxs_f32 += n; + break; + case GGML_TYPE_Q5_0: + n_flops->layer_q50_f32 += n; break; case GGML_TYPE_Q8_0: - n_flops->layer_q80_f32 += n; + n_flops->layer_q80_f32 += n; + break; + case GGML_TYPE_IQ1_S: + n_flops->layer_iq1s_f32 += n; + break; + case GGML_TYPE_IQ4_NL: + n_flops->layer_iq4nl_f32 += n; + break; + case GGML_TYPE_IQ1_M: + n_flops->layer_iq1m_f32 += n; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n"); @@ -21113,25 +21203,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_INPUT: switch (dtype) { case GGML_TYPE_F32: - n_params->input_f32 += n_i64t; + n_params->input_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->input_f16 += n_i64t; + n_params->input_f16 += n_i64t; + break; + case GGML_TYPE_Q2_K: + n_params->input_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->input_q4k += n_i64t; - break; - case GGML_TYPE_Q5_0: - n_params->input_q50 += n_i64t; + n_params->input_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->input_q5k += n_i64t; + n_params->input_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->input_q6k += n_i64t; + n_params->input_q6k += n_i64t; + break; + case GGML_TYPE_IQ2_XXS: + n_params->input_iq2xxs += n_i64t; + break; + case GGML_TYPE_Q5_0: + n_params->input_q50 += n_i64t; break; case GGML_TYPE_Q8_0: - n_params->input_q80 += n_i64t; + n_params->input_q80 += n_i64t; + break; + case GGML_TYPE_IQ1_S: + n_params->input_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->input_iq4nl += n_i64t; + break; + case GGML_TYPE_IQ1_M: + n_params->input_iq1m += n_i64t; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); @@ -21141,25 +21246,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_OUTPUT: switch (dtype) { case GGML_TYPE_F32: - n_params->output_f32 += n_i64t; + n_params->output_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->output_f16 += n_i64t; + n_params->output_f16 += n_i64t; + break; + case GGML_TYPE_Q2_K: + n_params->output_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->output_q4k += n_i64t; - break; - case GGML_TYPE_Q5_0: - n_params->output_q50 += n_i64t; + n_params->output_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->output_q5k += n_i64t; + n_params->output_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->output_q6k += n_i64t; + n_params->output_q6k += n_i64t; + break; + case GGML_TYPE_IQ2_XXS: + n_params->output_iq2xxs += n_i64t; + break; + case GGML_TYPE_Q5_0: + n_params->output_q50 += n_i64t; break; case GGML_TYPE_Q8_0: - n_params->output_q80 += n_i64t; + n_params->output_q80 += n_i64t; + break; + case GGML_TYPE_IQ1_S: + n_params->output_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->output_iq4nl += n_i64t; + break; + case GGML_TYPE_IQ1_M: + n_params->output_iq1m += n_i64t; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); @@ -21169,25 +21289,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_BACKEND: switch (dtype) { case GGML_TYPE_F32: - n_params->layer_f32 += n_i64t; + n_params->layer_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->layer_f16 += n_i64t; + n_params->layer_f16 += 
n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->layer_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->layer_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->layer_q50 += n_i64t;
+                    n_params->layer_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->layer_q5k += n_i64t;
+                    n_params->layer_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->layer_q6k += n_i64t;
+                    n_params->layer_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->layer_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->layer_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->layer_q80 += n_i64t;
+                    n_params->layer_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->layer_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->layer_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->layer_iq1m   += n_i64t;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21477,23 +21612,33 @@ void llama_model_n_flops(
     }
 
     // use average values instead of total values
-    n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
-    n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
-    n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
-    n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
-    n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
-    n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
-    n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
-
-    n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
-    n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
-    n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
-    n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
-    n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
-    n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
-    n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
-
-    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
+    n_flops->layer_f32_f32    = static_cast<int64_t>((double)n_flops->layer_f32_f32    / (double)n_layer);
+    n_flops->layer_f16_f32    = static_cast<int64_t>((double)n_flops->layer_f16_f32    / (double)n_layer);
+    n_flops->layer_q2k_f32    = static_cast<int64_t>((double)n_flops->layer_q2k_f32    / (double)n_layer);
+    n_flops->layer_q4k_f32    = static_cast<int64_t>((double)n_flops->layer_q4k_f32    / (double)n_layer);
+    n_flops->layer_q5k_f32    = static_cast<int64_t>((double)n_flops->layer_q5k_f32    / (double)n_layer);
+    n_flops->layer_q6k_f32    = static_cast<int64_t>((double)n_flops->layer_q6k_f32    / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
+    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
+    n_flops->layer_q80_f32    = static_cast<int64_t>((double)n_flops->layer_q80_f32    / (double)n_layer);
+    n_flops->layer_iq1s_f32   = static_cast<int64_t>((double)n_flops->layer_iq1s_f32   / (double)n_layer);
+    n_flops->layer_iq4nl_f32  = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32  / (double)n_layer);
+    n_flops->layer_iq1m_f32   = static_cast<int64_t>((double)n_flops->layer_iq1m_f32   / (double)n_layer);
+
+    n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
+    n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
+    n_params->layer_q2k    = static_cast<int64_t>((double)n_params->layer_q2k    / (double)n_layer);
+    n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
+    n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
+    n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
+    n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
+    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
+    n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
+    n_params->layer_iq1s   = static_cast<int64_t>((double)n_params->layer_iq1s   / (double)n_layer);
+    n_params->layer_iq4nl  = static_cast<int64_t>((double)n_params->layer_iq4nl  / (double)n_layer);
+    n_params->layer_iq1m   = static_cast<int64_t>((double)n_params->layer_iq1m   / (double)n_layer);
+
+    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
 
     // reset ml, model, and clear contexts
     ml->n_created = 0;
diff --git a/tools/profile_tool.cpp b/tools/profile_tool.cpp
new file mode 100644
index 00000000..328df2e3
--- /dev/null
+++ b/tools/profile_tool.cpp
@@ -0,0 +1,62 @@
+#include "arg.h"
+#include "common.h"
+#include "console.h"
+#include "log.h"
+#include "llama.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n    profile this device:        %s -m your_model.gguf\n", argv[0]);
+    LOG("\n    with a given thread count:  %s -m your_model.gguf -t 8\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+        return 1;
+    }
+
+    if (params.n_ctx != 0 && params.n_ctx < 8) {
+        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    if (params.rope_freq_base != 0.0) {
+        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 0.0) {
+        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+    }
+
+    // load the model and apply lora adapter, if any
+    auto mparams = llama_model_params_from_gpt_params(params);
+    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    struct llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else {
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
+    }
+
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return -1;
+    }
+
+    llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
+
+    device_info dev_info;
+    llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    device_print_props(&dev_info, 1, model, cparams);
+
+    llama_free_model(model);
+    return 0;
+}
\ No newline at end of file
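
The `serialize()`/`deserialize()` hunks above extend a fixed-order wire format: every new `flops_*` field must be written and read at exactly the same position on both ends, otherwise every field after the first mismatch is silently shifted. Below is a minimal sketch of that pattern, not part of the patch; the `toy_props` struct and its two fields are hypothetical stand-ins for the real `device_info` sub-structs.

```cpp
// Sketch of the fixed-order memcpy pack/unpack pattern used by serialize()/deserialize().
#include <cassert>
#include <cstdlib>
#include <cstring>

struct toy_props {               // hypothetical stand-in for a device_info sub-struct
    float flops_q4k_f32;
    float flops_iq1s_f32;
};

static size_t toy_serialize(const toy_props * p, char ** buffer) {
    size_t total = 2 * sizeof(float);
    *buffer = (char *) malloc(total);
    char * ptr = *buffer;

    memcpy(ptr, &p->flops_q4k_f32,  sizeof(float)); ptr += sizeof(float);
    memcpy(ptr, &p->flops_iq1s_f32, sizeof(float)); ptr += sizeof(float);
    return total;
}

static void toy_deserialize(const char * buffer, toy_props * p) {
    const char * ptr = buffer;
    // fields must be read back in exactly the order they were written
    memcpy(&p->flops_q4k_f32,  ptr, sizeof(float)); ptr += sizeof(float);
    memcpy(&p->flops_iq1s_f32, ptr, sizeof(float)); ptr += sizeof(float);
}

int main() {
    toy_props in { 512.0f, 64.0f }, out {};
    char * buf = nullptr;
    size_t n = toy_serialize(&in, &buf);
    assert(n == 2 * sizeof(float));
    toy_deserialize(buf, &out);
    assert(out.flops_q4k_f32 == in.flops_q4k_f32 && out.flops_iq1s_f32 == in.flops_iq1s_f32);
    free(buf);
    return 0;
}
```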
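`llama_bcast_startup_args()` relays the `should_profile` flag along the device chain: rank 0 sends it to its successor, every other rank receives it from its predecessor and forwards it unless it is the last rank. The sketch below mirrors only that control flow; the ZeroMQ sockets are replaced by hypothetical `send_next`/`recv_prev` callbacks, so it is an illustration of the relay pattern rather than the patched function itself.

```cpp
// Relay-style broadcast of a startup flag down a chain of ranks (sketch).
#include <cstdint>
#include <cstdio>
#include <functional>

struct startup_args_sketch { bool should_profile; };   // mirrors struct startup_args

int bcast_startup_args_sketch(uint32_t rank, uint32_t n_world, startup_args_sketch * args,
                              const std::function<bool(const startup_args_sketch &)> & send_next,
                              const std::function<bool(startup_args_sketch &)>       & recv_prev) {
    if (n_world == 1) {
        return 0;                                 // single device: nothing to broadcast
    }
    if (rank == 0) {
        return send_next(*args) ? 0 : -1;         // head pushes the flag downstream
    }
    if (!recv_prev(*args)) {
        return -1;                                // workers first receive from the previous rank
    }
    if (rank != n_world - 1) {
        return send_next(*args) ? 0 : -1;         // and forward it, unless they are the tail
    }
    return 0;
}

int main() {
    // simulate a 3-device chain with one "wire" variable standing in for the sockets
    startup_args_sketch wire{false};
    auto send = [&](const startup_args_sketch & a) { wire = a; return true; };
    auto recv = [&](startup_args_sketch & a)       { a = wire; return true; };

    startup_args_sketch head{true}, worker1{false}, worker2{false};
    bcast_startup_args_sketch(0, 3, &head,    send, recv);
    bcast_startup_args_sketch(1, 3, &worker1, send, recv);
    bcast_startup_args_sketch(2, 3, &worker2, send, recv);
    printf("worker1=%d worker2=%d\n", worker1.should_profile, worker2.should_profile); // 1 1
    return 0;
}
```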
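In the quantization path, the patched `chunk_size` expression in `llama_model_quantize_internal` now simply rounds the chunk up to a whole number of rows that is at least `min_chunk_size`; the former `Q4_0_4_x` multiplier is gone along with those types. A small worked example, with a made-up row width:

```cpp
// Worked example of the chunk-size rounding used during quantization.
#include <cstdint>
#include <cstdio>

int main() {
    static const int64_t min_chunk_size = 32 * 512;   // 16384, as in the patch
    const int64_t n_per_row = 4096;                    // hypothetical row width

    // same expression as the patched line: one row if it is already large enough,
    // otherwise ceil(min_chunk_size / n_per_row) whole rows
    const int64_t chunk_size = (n_per_row >= min_chunk_size ?
        n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

    printf("chunk_size = %lld\n", (long long) chunk_size);   // 16384, i.e. 4 whole rows
    return 0;
}
```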
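At the end of `llama_model_n_flops`, the accumulated `layer_*` counters are divided by `n_layer`, so downstream cost estimation works with per-layer averages and can scale them by however many layers a device is assigned (for example via `-lw`). A tiny illustration with made-up numbers:

```cpp
// Per-layer averaging of accumulated counters (illustrative values only).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_layer     = 80;                // e.g. a 70B-class model
    const int64_t total_flops = 54000000000LL;     // hypothetical total over all repeating layers

    // same cast-through-double pattern as the end of llama_model_n_flops
    const int64_t per_layer = static_cast<int64_t>((double) total_flops / (double) n_layer);

    // a device assigned 16 layers would then be charged roughly 16 * per_layer
    const int64_t for_16_layers = 16 * per_layer;
    printf("per layer: %lld, for 16 layers: %lld\n",
           (long long) per_layer, (long long) for_16_layers);
    return 0;
}
```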