From e2cda4cfa0de50dacc139580a416b5b4dee0c7de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?leeetao=C2=A0?= <3122669219@qq.com> Date: Sat, 1 Mar 2025 14:31:38 +0000 Subject: [PATCH 01/31] Removed support for GGML_TYPE_Q4_0_4_4, GGML_TYPE_Q4_0_4_8, and GGML_TYPE_Q4_0_8_8 (GGUF no longer supports these types) --- ggml/include/ggml.h | 9 ++--- ggml/src/ggml-quants.c | 9 ----- ggml/src/ggml.c | 75 ------------------------------------------ include/llama.h | 10 +++--- src/llama.cpp | 27 ++------------- 5 files changed, 11 insertions(+), 119 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 962dc032..3052bb65 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -385,9 +385,9 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, - GGML_TYPE_Q4_0_4_4 = 31, - GGML_TYPE_Q4_0_4_8 = 32, - GGML_TYPE_Q4_0_8_8 = 33, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, GGML_TYPE_COUNT, @@ -431,9 +431,6 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7aa6dce8..1c57cb95 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); - } break; - case GGML_TYPE_Q4_0_8_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); - } break; case GGML_TYPE_I8: case GGML_TYPE_I16: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 73426a5d..43a953c6 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, }, - [GGML_TYPE_Q4_0_4_4] = { - .type_name = "q4_0_4x4", - .blck_size = QK4_0, - .blck_size_interleave = 4, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float = NULL, - .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x4_q8_0, - .gemm = ggml_gemm_q4_0_4x4_q8_0, - }, - [GGML_TYPE_Q4_0_4_8] = { - .type_name = "q4_0_4x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float = NULL, - .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x8_q8_0, - .gemm = ggml_gemm_q4_0_4x8_q8_0, - }, - [GGML_TYPE_Q4_0_8_8] = { - .type_name = "q4_0_8x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float = NULL, - .from_float_ref = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 8, - .gemv = ggml_gemv_q4_0_8x8_q8_0, - .gemm = ggml_gemm_q4_0_8x8_q8_0, - }, [GGML_TYPE_TQ1_0] = { .type_name = 
"tq1_0", .blck_size = QK_K, @@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; - case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 356b0fbf..1d45e7ad 100644 --- a/include/llama.h +++ b/include/llama.h @@ -165,18 +165,18 
@@ extern "C" { LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors, 1 bit quantization LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors, 1 bit quantization LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack + // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack + // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 810dd3f7..f782616f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4843,9 +4843,7 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; - case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; - case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; + default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -5653,9 +5651,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; default: return "unknown, may not work"; } @@ -18996,10 +18991,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || - new_type == GGML_TYPE_Q4_0_8_8) { - new_type = GGML_TYPE_Q4_0; - } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -19322,10 +19313,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; - + default: throw std::runtime_error(format("invalid output file 
type %d\n", ftype)); } @@ -19645,14 +19633,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - int chunk_size_multiplier = 1; - if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { - if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; - else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; - if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; - } - LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -19665,8 +19645,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int64_t nrows = tensor->ne[1]; static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; From 2f049b8428c49e0e6c16dcf463de14922c636195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?leeetao=C2=A0?= <3122669219@qq.com> Date: Tue, 4 Mar 2025 15:22:55 +0000 Subject: [PATCH 02/31] Added support for Q2K, IQ1s, IQ4NL quantization types --- common/common.cpp | 115 ++++++++-------- common/profiler.cpp | 319 ++++++++++++++++++++++++++++++++++++-------- common/profiler.h | 145 +++++++++++++------- src/llama.cpp | 190 ++++++++++++++++++-------- 4 files changed, 551 insertions(+), 218 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ae7dd883..33523467 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -901,13 +901,17 @@ static bool assign_layers_to_device( float t_read_ram_cpu = 0.0f; float t_calc_cpu = ( - master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS)+ + master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms // 
t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms @@ -921,24 +925,32 @@ static bool assign_layers_to_device( if (dev.gpu_support.metal) { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms } else { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms } @@ -1113,13 +1125,16 @@ static 
bool assign_layers_to_device( if (m == 0) { kappa = ( - dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms @@ -1766,33 +1781,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & return mparams; } -static ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return GGML_TYPE_F32; - } - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } +const std::vector<ggml_type> kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, // Added BF16 data type support + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; - throw std::runtime_error("Invalid cache type: " + s); +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); } struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { diff --git a/common/profiler.cpp b/common/profiler.cpp index 48af0950..adc9a9e7 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -364,14 +364,16 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in ggml_fp32_to_fp16_row(temp_f32.data(), static_cast<ggml_fp16_t *>(matrix_B), embd_size); break; } + case GGML_TYPE_Q2_K: case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: + case GGML_TYPE_Q5_0: case GGML_TYPE_Q8_0: - QK_K = 256; - matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t)); + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + matrix_B = malloc((embd_size / ggml_blck_size(src0t)) * ggml_type_size(src0t)); // block sizes differ across quantization types, so derive the allocation from the type's own block size break; default: LOG_INF("Unsupported type: %d\n", 
src0t); @@ -1349,31 +1351,39 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.cuda_flops_q2k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.cuda_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.cuda_flops_iq4nl_f32 + EPS) / 1e9; #elif GGML_USE_METAL struct gpu_props gpu = dev_info.gpu_props; gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.metal_flops_q2k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.metal_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.metal_flops_iq4nl_f32 + EPS) / 1e9; #endif cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; - + cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / 
((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; double total_latency = 0.0f; #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) @@ -1387,11 +1397,14 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; total_latency *= 1000; // convert to ms @@ -1696,18 +1709,18 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CPU flops (Q2K x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32); + } + LOG_INF("\n"); + LOG_INF("| CPU flops (Q4K x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q50 x F32, GFLOPS)"); - for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); - } - LOG_INF("\n"); - LOG_INF("| CPU flops (Q5K x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32); } @@ -1720,12 +1732,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CPU flops (Q50 x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); + } + LOG_INF("\n"); + LOG_INF("| CPU flops (Q80 x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32); } LOG_INF("\n"); + LOG_INF("| CPU flops (IQ1S x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32); + } + LOG_INF("\n"); + LOG_INF("| Physical Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); @@ -1882,15 +1912,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q2KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q50xF32, GFLOPS)"); + LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); + LOG_INF("| 
%-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); } LOG_INF("\n"); @@ -1906,12 +1936,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| Metal flops (Q50xF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); + } + LOG_INF("\n"); + LOG_INF("| Metal flops (Q80xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32); } LOG_INF("\n"); + LOG_INF("| Metal flops (IQ1SxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32); + } + LOG_INF("\n"); + LOG_INF("| CUDA VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); @@ -1936,15 +1984,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); @@ -1960,12 +2008,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); + } + LOG_INF("\n"); + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } LOG_INF("\n"); + LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32); + } + LOG_INF("\n"); + LOG_INF("| Model flops (output F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32); LOG_INF("\n"); @@ -1974,12 +2040,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q4KxF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); + LOG_INF("| Model flops (output Q2KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q50xF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); + LOG_INF("| Model flops (output Q4KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); LOG_INF("\n"); LOG_INF("| Model flops (output Q5KxF32) "); @@ -1989,11 +2055,23 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| Model flops (output Q6KxF32) "); 
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); + + LOG_INF("| Model flops (output Q50xF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); + LOG_INF("\n"); LOG_INF("| Model flops (output Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32); LOG_INF("\n"); + LOG_INF("| Model flops (output IQ1SxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output IQ4NLxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32); + LOG_INF("\n"); + LOG_INF("| Model flops (layer F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32); LOG_INF("\n"); @@ -2002,12 +2080,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q4KxF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); + LOG_INF("| Model flops (layer Q2KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q50xF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); + LOG_INF("| Model flops (layer Q4KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); LOG_INF("\n"); LOG_INF("| Model flops (layer Q5KxF32) "); @@ -2018,10 +2096,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); + LOG_INF("| Model flops (layer Q50xF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); + LOG_INF("\n"); + LOG_INF("| Model flops (layer Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32); LOG_INF("\n"); + LOG_INF("| Model flops (layer IQ1SxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer IQ4NLxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32); + LOG_INF("\n"); + LOG_INF("| Model params (input F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32); LOG_INF("\n"); @@ -2030,12 +2120,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); LOG_INF("\n"); - LOG_INF("| Model params (input Q4K) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); + LOG_INF("| Model params (input Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k); LOG_INF("\n"); - LOG_INF("| Model params (input Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); + LOG_INF("| Model params (input Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); LOG_INF("\n"); LOG_INF("| Model params (input Q5K) "); @@ -2046,10 +2136,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); LOG_INF("\n"); + LOG_INF("| Model params (input Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); + LOG_INF("\n"); + LOG_INF("| Model params (input Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); 
LOG_INF("\n"); + LOG_INF("| Model params (input IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (input IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl); + LOG_INF("\n"); + LOG_INF("| Model params (layer F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); LOG_INF("\n"); @@ -2058,12 +2160,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); LOG_INF("\n"); - LOG_INF("| Model params (layer Q4K) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); + LOG_INF("| Model params (layer Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); + LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); LOG_INF("\n"); LOG_INF("| Model params (layer Q5K) "); @@ -2074,10 +2176,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); LOG_INF("\n"); + LOG_INF("| Model params (layer Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); + LOG_INF("\n"); + LOG_INF("| Model params (layer Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); LOG_INF("\n"); + LOG_INF("| Model params (layer IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (layer IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl); + LOG_INF("\n"); + LOG_INF("| Model params (output F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); LOG_INF("\n"); @@ -2086,12 +2200,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); LOG_INF("\n"); - LOG_INF("| Model params (output Q4K) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); + LOG_INF("| Model params (output Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k); LOG_INF("\n"); - LOG_INF("| Model params (output Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); + LOG_INF("| Model params (output Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); LOG_INF("\n"); LOG_INF("| Model params (output Q5K) "); @@ -2102,10 +2216,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); LOG_INF("\n"); + LOG_INF("| Model params (output Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); + LOG_INF("\n"); + LOG_INF("| Model params (output Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); + LOG_INF("| Model params (output IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (output IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl); + LOG_INF("\n"); + LOG_INF("| Model bytes (input) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input); LOG_INF("\n"); @@ 
-2155,17 +2281,38 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(struct disk_props) + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 7 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + + sizeof(float) * 10 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, + // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32 + // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32 + // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 20; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, - // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, - // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32, - // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay + + sizeof(float) * 26; // GPU attributes + // memory: + // - memory_free, memory_total + // - metal_read_vram_bw, cuda_read_vram_bw + // Metal floating-point performance: + // - metal_flops_f32_f32, metal_flops_f16_f32 + // - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32 + // - metal_flops_q50_f32, metal_flops_q80_f32 + // - metal_flops_iq1s_f32, metal_flops_iq4nl_f32 + // CUDA floating-point performance: + // - cuda_flops_f32_f32, cuda_flops_f16_f32 + // - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32 + // - cuda_flops_q50_f32, cuda_flops_q80_f32 + // - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32 + // delay: + // - metal_mem_cpy_delay, cuda_mem_cpy_delay *buffer = (char *)malloc(total_size); char * ptr = *buffer; + if (*buffer == NULL) { + LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n", + __func__, total_size); + return 0; + } + // rank memcpy(ptr, &dev_info->rank, sizeof(uint32_t)); ptr += sizeof(uint32_t); @@ -2214,10 +2361,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float)); @@ -2226,9 +2373,18 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); 
ptr += sizeof(struct memory_info); @@ -2250,10 +2406,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float)); @@ -2262,9 +2418,18 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float)); ptr += sizeof(float); @@ -2277,10 +2442,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float)); @@ -2289,9 +2454,18 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float)); // no need to synchronize model flops and model params @@ -2366,10 +2540,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float)); @@ -2378,9 +2552,18 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float)); + 
ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2402,10 +2585,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float)); @@ -2414,9 +2597,18 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float)); ptr += sizeof(float); @@ -2429,10 +2621,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float)); @@ -2441,9 +2633,18 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float)); // no need to synchronize model flops and model params diff --git a/common/profiler.h b/common/profiler.h index fb9a4ddb..0681a711 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -17,23 +17,30 @@ struct cpu_props { uint32_t cores; float flops_f32_f32; // in GFLOPS float flops_f16_f32; // in GFLOPS + float flops_q2k_f32; // in GFLOPS float flops_q4k_f32; // in GFLOPS - float flops_q50_f32; // in GFLOPS float flops_q5k_f32; // in GFLOPS float flops_q6k_f32; // in GFLOPS + float flops_q50_f32; // in GFLOPS float flops_q80_f32; // in GFLOPS + float flops_iq1s_f32; // in GFLOPS + float flops_iq4nl_f32; // in GFLOPS - cpu_props() : - name(""), - description(""), - cores(0), - flops_f32_f32(0.0f), - flops_f16_f32(0.0f), - flops_q4k_f32(0.0f), - flops_q50_f32(0.0f), - flops_q5k_f32(0.0f), - flops_q6k_f32(0.0f), - flops_q80_f32(0.0f) {} + cpu_props() 
+ : name (""), + description (""), + cores (0), + flops_f32_f32 (0.0f), + flops_f16_f32 (0.0f), + flops_q2k_f32 (0.0f), + flops_q4k_f32 (0.0f), + flops_q5k_f32 (0.0f), + flops_q6k_f32 (0.0f), + flops_q50_f32 (0.0f), + flops_q80_f32 (0.0f), + flops_iq1s_f32 (0.0f), + flops_iq4nl_f32 (0.0f) + {} }; struct memory_info { @@ -82,127 +89,169 @@ struct gpu_props { float metal_read_vram_bw; // in GB/s float metal_flops_f32_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS + float metal_flops_q2k_f32; // in GFLOPS float metal_flops_q4k_f32; // in GFLOPS - float metal_flops_q50_f32; // in GFLOPS float metal_flops_q5k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS + float metal_flops_q50_f32; // in GFLOPS float metal_flops_q80_f32; // in GFLOPS + float metal_flops_iq1s_f32; // in GFLOPS + float metal_flops_iq4nl_f32; // in GFLOPS float metal_mem_cpy_delay; // in ms float cuda_read_vram_bw; // in GB/s float cuda_flops_f32_f32; // in GFLOPS float cuda_flops_f16_f32; // in GFLOPS + float cuda_flops_q2k_f32; // in GFLOPS float cuda_flops_q4k_f32; // in GFLOPS - float cuda_flops_q50_f32; // in GFLOPS float cuda_flops_q5k_f32; // in GFLOPS float cuda_flops_q6k_f32; // in GFLOPS + float cuda_flops_q50_f32; // in GFLOPS float cuda_flops_q80_f32; // in GFLOPS + float cuda_flops_iq1s_f32; // in GFLOPS + float cuda_flops_iq4nl_f32; // in GFLOPS float cuda_mem_cpy_delay; // in ms gpu_props() : - name(""), - description(""), - memory_free (0.0f), - memory_total (0.0f), - metal_read_vram_bw (0.0f), - metal_flops_f32_f32(0.0f), - metal_flops_f16_f32(0.0f), - metal_flops_q4k_f32(0.0f), - metal_flops_q50_f32(0.0f), - metal_flops_q5k_f32(0.0f), - metal_flops_q6k_f32(0.0f), - metal_flops_q80_f32(0.0f), - metal_mem_cpy_delay(0.0f), - cuda_read_vram_bw (0.0f), - cuda_flops_f32_f32 (0.0f), - cuda_flops_f16_f32 (0.0f), - cuda_flops_q4k_f32 (0.0f), - cuda_flops_q50_f32 (0.0f), - cuda_flops_q5k_f32 (0.0f), - cuda_flops_q6k_f32 (0.0f), - cuda_flops_q80_f32 (0.0f), - cuda_mem_cpy_delay (0.0f) {} + name (""), + description (""), + memory_free (0.0f), + memory_total (0.0f), + metal_read_vram_bw (0.0f), + metal_flops_f32_f32 (0.0f), + metal_flops_f16_f32 (0.0f), + metal_flops_q2k_f32 (0.0f), + metal_flops_q4k_f32 (0.0f), + metal_flops_q5k_f32 (0.0f), + metal_flops_q6k_f32 (0.0f), + metal_flops_q50_f32 (0.0f), + metal_flops_q80_f32 (0.0f), + metal_flops_iq1s_f32 (0.0f), + metal_flops_iq4nl_f32 (0.0f), + metal_mem_cpy_delay (0.0f), + cuda_read_vram_bw (0.0f), + cuda_flops_f32_f32 (0.0f), + cuda_flops_f16_f32 (0.0f), + cuda_flops_q2k_f32 (0.0f), + cuda_flops_q4k_f32 (0.0f), + cuda_flops_q5k_f32 (0.0f), + cuda_flops_q6k_f32 (0.0f), + cuda_flops_q50_f32 (0.0f), + cuda_flops_q80_f32 (0.0f), + cuda_flops_iq1s_f32 (0.0f), + cuda_flops_iq4nl_f32 (0.0f), + cuda_mem_cpy_delay (0.0f) {} }; struct model_flops { float inp_embd_ms; int64_t output_f32_f32; int64_t output_f16_f32; + int64_t output_q2k_f32; int64_t output_q4k_f32; - int64_t output_q50_f32; int64_t output_q5k_f32; int64_t output_q6k_f32; + int64_t output_q50_f32; int64_t output_q80_f32; + int64_t output_iq1s_f32; + int64_t output_iq4nl_f32; int64_t layer_f32_f32; int64_t layer_f16_f32; + int64_t layer_q2k_f32; int64_t layer_q4k_f32; - int64_t layer_q50_f32; int64_t layer_q5k_f32; int64_t layer_q6k_f32; + int64_t layer_q50_f32; int64_t layer_q80_f32; + int64_t layer_iq1s_f32; + int64_t layer_iq4nl_f32; model_flops() : inp_embd_ms(0.0f), output_f32_f32(0), output_f16_f32(0), + output_q2k_f32(0), output_q4k_f32(0), - output_q50_f32(0), output_q5k_f32(0), 
output_q6k_f32(0), + output_q50_f32(0), output_q80_f32(0), + output_iq1s_f32(0), + output_iq4nl_f32(0), layer_f32_f32 (0), layer_f16_f32 (0), + layer_q2k_f32 (0), layer_q4k_f32 (0), - layer_q50_f32 (0), layer_q5k_f32 (0), layer_q6k_f32 (0), - layer_q80_f32 (0) {} + layer_q50_f32 (0), + layer_q80_f32 (0), + layer_iq1s_f32 (0), + layer_iq4nl_f32 (0) {} }; struct model_params { int64_t input_f32; int64_t input_f16; + int64_t input_q2k; int64_t input_q4k; - int64_t input_q50; int64_t input_q5k; int64_t input_q6k; + int64_t input_q50; int64_t input_q80; + int64_t input_iq1s; + int64_t input_iq4nl; int64_t output_f32; int64_t output_f16; + int64_t output_q2k; int64_t output_q4k; - int64_t output_q50; int64_t output_q5k; int64_t output_q6k; + int64_t output_q50; int64_t output_q80; + int64_t output_iq1s; + int64_t output_iq4nl; int64_t layer_f32; int64_t layer_f16; + int64_t layer_q2k; int64_t layer_q4k; - int64_t layer_q50; int64_t layer_q5k; int64_t layer_q6k; + int64_t layer_q50; int64_t layer_q80; + int64_t layer_iq1s; + int64_t layer_iq4nl; model_params() : input_f32 (0), input_f16 (0), + input_q2k (0), input_q4k (0), - input_q50 (0), input_q5k (0), input_q6k (0), + input_q50 (0), input_q80 (0), + input_iq1s(0), + input_iq4nl(0), output_f32(0), output_f16(0), + output_q2k(0), output_q4k(0), - output_q50(0), output_q5k(0), output_q6k(0), + output_q50(0), output_q80(0), + output_iq1s(0), + output_iq4nl(0), layer_f32 (0), layer_f16 (0), + layer_q2k (0), layer_q4k (0), - layer_q50 (0), layer_q5k (0), layer_q6k (0), - layer_q80 (0) {} + layer_q50 (0), + layer_q80 (0), + layer_iq1s (0), + layer_iq4nl (0) {} }; struct model_bytes { diff --git a/src/llama.cpp b/src/llama.cpp index f782616f..51ef97c8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3559,16 +3559,22 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) case GGML_TYPE_F32: case GGML_TYPE_F16: return true; + case GGML_TYPE_Q2_K: + return n_params->layer_q2k > 0 || n_params->output_q2k > 0; case GGML_TYPE_Q4_K: - return n_params->layer_q4k > 0 || n_params->output_q4k > 0; - case GGML_TYPE_Q5_0: - return n_params->layer_q50 > 0 || n_params->output_q50 > 0; + return n_params->layer_q4k > 0 || n_params->output_q4k > 0; case GGML_TYPE_Q5_K: - return n_params->layer_q5k > 0 || n_params->output_q5k > 0; + return n_params->layer_q5k > 0 || n_params->output_q5k > 0; case GGML_TYPE_Q6_K: - return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + case GGML_TYPE_Q5_0: + return n_params->layer_q50 > 0 || n_params->output_q50 > 0; case GGML_TYPE_Q8_0: - return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + case GGML_TYPE_IQ1_S: + return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; + case GGML_TYPE_IQ4_NL: + return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0; default: throw std::runtime_error("Unrecognized data type\n"); } @@ -3649,18 +3655,18 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) { + dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + } + if (is_dtype_exist(n_params, 
GGML_TYPE_Q4_K)) { dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); } - if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { - dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - } - if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) { dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32); @@ -3673,11 +3679,30 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { + dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) { dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) { + dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) { + dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + } } ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) { @@ -21029,49 +21054,67 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en case GGML_TYPE_F16: n_flops->output_f16_f32 += n; break; + case GGML_TYPE_Q2_K: + n_flops->output_q2k_f32 += n; + break; case GGML_TYPE_Q4_K: n_flops->output_q4k_f32 += n; break; - case GGML_TYPE_Q5_0: - n_flops->output_q50_f32 += n; - break; case GGML_TYPE_Q5_K: n_flops->output_q5k_f32 += n; break; case GGML_TYPE_Q6_K: n_flops->output_q6k_f32 += n; break; + case GGML_TYPE_Q5_0: + n_flops->output_q50_f32 += n; + break; case GGML_TYPE_Q8_0: n_flops->output_q80_f32 += n; break; + case GGML_TYPE_IQ1_S: + n_flops->output_iq1s_f32 += n; + break; + case GGML_TYPE_IQ4_NL: + n_flops->output_iq4nl_f32 += n; + break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); } break; case PROFILER_LAYER_BACKEND: - switch (dtype) { + switch (dtype) { case 
GGML_TYPE_F32: n_flops->layer_f32_f32 += n; break; case GGML_TYPE_F16: n_flops->layer_f16_f32 += n; break; + case GGML_TYPE_Q2_K: + n_flops->layer_q2k_f32 += n; + break; case GGML_TYPE_Q4_K: n_flops->layer_q4k_f32 += n; break; - case GGML_TYPE_Q5_0: - n_flops->layer_q50_f32 += n; - break; case GGML_TYPE_Q5_K: n_flops->layer_q5k_f32 += n; break; case GGML_TYPE_Q6_K: n_flops->layer_q6k_f32 += n; break; + case GGML_TYPE_Q5_0: + n_flops->layer_q50_f32 += n; + break; case GGML_TYPE_Q8_0: n_flops->layer_q80_f32 += n; break; + case GGML_TYPE_IQ1_S: + n_flops->layer_iq1s_f32 += n; + break; + case GGML_TYPE_IQ4_NL: + n_flops->layer_iq4nl_f32 += n; + break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n"); } @@ -21093,21 +21136,30 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case GGML_TYPE_F16: n_params->input_f16 += n_i64t; break; + case GGML_TYPE_Q2_K: + n_params->input_q2k += n_i64t; + break; case GGML_TYPE_Q4_K: n_params->input_q4k += n_i64t; break; - case GGML_TYPE_Q5_0: - n_params->input_q50 += n_i64t; - break; case GGML_TYPE_Q5_K: n_params->input_q5k += n_i64t; break; case GGML_TYPE_Q6_K: n_params->input_q6k += n_i64t; break; + case GGML_TYPE_Q5_0: + n_params->input_q50 += n_i64t; + break; case GGML_TYPE_Q8_0: n_params->input_q80 += n_i64t; break; + case GGML_TYPE_IQ1_S: + n_params->input_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->input_iq4nl += n_i64t; + break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); } @@ -21116,25 +21168,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_OUTPUT: switch (dtype) { case GGML_TYPE_F32: - n_params->output_f32 += n_i64t; + n_params->output_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->output_f16 += n_i64t; + n_params->output_f16 += n_i64t; + break; + case GGML_TYPE_Q2_K: + n_params->output_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->output_q4k += n_i64t; - break; - case GGML_TYPE_Q5_0: - n_params->output_q50 += n_i64t; + n_params->output_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->output_q5k += n_i64t; + n_params->output_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->output_q6k += n_i64t; + n_params->output_q6k += n_i64t; + break; + case GGML_TYPE_Q5_0: + n_params->output_q50 += n_i64t; break; case GGML_TYPE_Q8_0: - n_params->output_q80 += n_i64t; + n_params->output_q80 += n_i64t; + break; + case GGML_TYPE_IQ1_S: + n_params->output_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->output_iq4nl += n_i64t; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); @@ -21144,25 +21205,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_BACKEND: switch (dtype) { case GGML_TYPE_F32: - n_params->layer_f32 += n_i64t; + n_params->layer_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->layer_f16 += n_i64t; + n_params->layer_f16 += n_i64t; + break; + case GGML_TYPE_Q2_K: + n_params->layer_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->layer_q4k += n_i64t; - break; - case GGML_TYPE_Q5_0: - n_params->layer_q50 += n_i64t; + n_params->layer_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->layer_q5k += n_i64t; + n_params->layer_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->layer_q6k += n_i64t; + n_params->layer_q6k += n_i64t; + break; + case GGML_TYPE_Q5_0: + n_params->layer_q50 += n_i64t; break; case 
GGML_TYPE_Q8_0:
-                n_params->layer_q80 += n_i64t;
+                n_params->layer_q80   += n_i64t;
+                break;
+            case GGML_TYPE_IQ1_S:
+                n_params->layer_iq1s  += n_i64t;
+                break;
+            case GGML_TYPE_IQ4_NL:
+                n_params->layer_iq4nl += n_i64t;
                 break;
             default:
                 throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21452,23 +21522,29 @@ void llama_model_n_flops(
     }
 
     // use average values instead of total values
-    n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
-    n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
-    n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
-    n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
-    n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
-    n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
-    n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
-
-    n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
-    n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
-    n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
-    n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
-    n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
-    n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
-    n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
-
-    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
+    n_flops->layer_f32_f32   = static_cast<int64_t>((double)n_flops->layer_f32_f32   / (double)n_layer);
+    n_flops->layer_f16_f32   = static_cast<int64_t>((double)n_flops->layer_f16_f32   / (double)n_layer);
+    n_flops->layer_q2k_f32   = static_cast<int64_t>((double)n_flops->layer_q2k_f32   / (double)n_layer);
+    n_flops->layer_q4k_f32   = static_cast<int64_t>((double)n_flops->layer_q4k_f32   / (double)n_layer);
+    n_flops->layer_q5k_f32   = static_cast<int64_t>((double)n_flops->layer_q5k_f32   / (double)n_layer);
+    n_flops->layer_q6k_f32   = static_cast<int64_t>((double)n_flops->layer_q6k_f32   / (double)n_layer);
+    n_flops->layer_q50_f32   = static_cast<int64_t>((double)n_flops->layer_q50_f32   / (double)n_layer);
+    n_flops->layer_q80_f32   = static_cast<int64_t>((double)n_flops->layer_q80_f32   / (double)n_layer);
+    n_flops->layer_iq1s_f32  = static_cast<int64_t>((double)n_flops->layer_iq1s_f32  / (double)n_layer);
+    n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
+
+    n_params->layer_f32   = static_cast<int64_t>((double)n_params->layer_f32   / (double)n_layer);
+    n_params->layer_f16   = static_cast<int64_t>((double)n_params->layer_f16   / (double)n_layer);
+    n_params->layer_q2k   = static_cast<int64_t>((double)n_params->layer_q2k   / (double)n_layer);
+    n_params->layer_q4k   = static_cast<int64_t>((double)n_params->layer_q4k   / (double)n_layer);
+    n_params->layer_q50   = static_cast<int64_t>((double)n_params->layer_q50   / (double)n_layer);
+    n_params->layer_q5k   = static_cast<int64_t>((double)n_params->layer_q5k   / (double)n_layer);
+    n_params->layer_q6k   = static_cast<int64_t>((double)n_params->layer_q6k   / (double)n_layer);
+    n_params->layer_q80   = static_cast<int64_t>((double)n_params->layer_q80   / (double)n_layer);
+    n_params->layer_iq1s  = static_cast<int64_t>((double)n_params->layer_iq1s  / (double)n_layer);
+    n_params->layer_iq4nl = static_cast<int64_t>((double)n_params->layer_iq4nl / (double)n_layer);
+
+    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
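// The per-layer averages above let downstream cost models treat the model as
// "n_layer copies of one typical layer"; assign_layers_to_device() then divides
// these per-layer flops by each device's measured GFLOPS to estimate latency.
// A minimal sketch of the recurring cast-and-divide pattern, using a
// hypothetical helper that is not part of this patch:
//
//     static int64_t avg_per_layer(int64_t total, uint32_t n_layer) {
//         // truncates toward zero, matching the static_cast<int64_t> above
//         return n_layer > 0 ? static_cast<int64_t>((double)total / (double)n_layer) : 0;
//     }
//
//     n_flops->layer_q2k_f32 = avg_per_layer(n_flops->layer_q2k_f32, n_layer);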
    // reset ml, model, and clear contexts
    ml->n_created = 0;

From 54c4c1c26ef79827a0074007cb97ad036695d59c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?leeetao=C2=A0?= <3122669219@qq.com>
Date: Fri, 7 Mar 2025 02:47:00 +0000
Subject: [PATCH 03/31] Fixed the flops test for iq1s and q2k quantization types

---
 common/profiler.cpp | 6 ++++++
 ggml/include/ggml.h | 2 +-
 ggml/src/ggml.c     | 4 ++--
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index adc9a9e7..e262b6a2 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -188,6 +188,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);
 
+    if (n_embd < ggml_blck_size(src0t)) {
+        n_embd = 2 * ggml_blck_size(src0t);
+    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
@@ -208,10 +211,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     ctx_cgraph = ggml_init(params0);
     gf = ggml_new_graph(ctx_cgraph);
 
+    cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
     for (int i = 0; i < n_repeat - 1; i++) {
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
     }
+
     ggml_build_forward_expand(gf, cur);
 }
@@ -1713,6 +1718,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32);
     }
+    LOG_INF("\n");
 
     LOG_INF("| CPU flops (Q4K x F32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3052bb65..4af68abc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -390,7 +390,7 @@ extern "C" {
     // GGML_TYPE_Q4_0_8_8 = 33,
     GGML_TYPE_TQ1_0 = 34,
     GGML_TYPE_TQ2_0 = 35,
-    GGML_TYPE_COUNT,
+    GGML_TYPE_COUNT = 39,
 };
 
 // precision
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 43a953c6..ffae7f2e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3424,7 +3424,7 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
 double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
-
+
 const char * ggml_type_name(enum ggml_type type) {
     return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
 }
@@ -4056,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data =*/ obj_alloc_size > 0 ?
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - ///*.padding =*/ { 0 }, + // /*.padding =*/ { 0 }, }; #ifdef __clang__ From 6a416534c86e7da047953933e115a3a506ce142f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?leeetao=C2=A0?= <3122669219@qq.com> Date: Fri, 7 Mar 2025 07:46:30 +0000 Subject: [PATCH 04/31] Fixed the alignment display of device performance --- common/profiler.cpp | 220 ++++++++++++++++++++++---------------------- src/llama.cpp | 2 +- 2 files changed, 111 insertions(+), 111 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index e262b6a2..9c00f535 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -1665,92 +1665,92 @@ static float device_mem_copy_delay(struct device_info & dev_info, struct llama_m void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) { LOG_INF("\n-------------------------------------------------------------------------------------------\n"); - LOG_INF("| Property "); + LOG_INF("| Property "); for (int i = 0; i < n; ++i) { LOG_INF("| Rank %-8d", i); GGML_ASSERT((int)dev_info_set[i].rank == i); } LOG_INF("\n-------------------------------------------------------------------------------------------\n"); - LOG_INF("| Device Name "); + LOG_INF("| Device Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].device_name); } LOG_INF("\n"); - LOG_INF("| Device OS "); + LOG_INF("| Device OS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].device_os); } LOG_INF("\n"); - LOG_INF("| CPU Name "); + LOG_INF("| CPU Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name); } LOG_INF("\n"); - LOG_INF("| CPU Description "); + LOG_INF("| CPU Description "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description); } LOG_INF("\n"); - LOG_INF("| Number of CPU cores "); + LOG_INF("| Number of CPU cores "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores); } LOG_INF("\n"); - LOG_INF("| CPU flops (F32xF32, GFLOPS) "); + LOG_INF("| CPU flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (F16xF32, GFLOPS) "); + LOG_INF("| CPU flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q2K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q2K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q4K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q4K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q5K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q5K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q6K x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q6K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q50 x F32, GFLOPS)"); + LOG_INF("| CPU flops (Q50 x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q80 x F32, 
GFLOPS)"); + LOG_INF("| CPU flops (Q80 x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (IQ1S x F32, GFLOPS)"); + LOG_INF("| CPU flops (IQ1S x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32); } @@ -1762,199 +1762,199 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Physical Mem Total (GiB) "); + LOG_INF("| Physical Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); } LOG_INF("\n"); - LOG_INF("| Physical Mem Available (GiB) "); + LOG_INF("| Physical Mem Available (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical); } LOG_INF("\n"); - LOG_INF("| Used Mem Swappable (GiB) "); + LOG_INF("| Used Mem Swappable (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.used_can_swap); } LOG_INF("\n"); - LOG_INF("| Swap Mem Total (GiB) "); + LOG_INF("| Swap Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap); } LOG_INF("\n"); - LOG_INF("| Swap Mem Available (GiB) "); + LOG_INF("| Swap Mem Available (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap); } LOG_INF("\n"); - LOG_INF("| CPU RAM Read BW (GB/s) "); + LOG_INF("| CPU RAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw); } LOG_INF("\n"); - LOG_INF("| CPU KVCache Copy Time (ms/l) "); + LOG_INF("| CPU KVCache Copy Time (ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| Disk Read Seq Speed (GB/s) "); + LOG_INF("| Disk Read Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw); } LOG_INF("\n"); - LOG_INF("| Disk Write Seq Speed (GB/s) "); + LOG_INF("| Disk Write Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw); } LOG_INF("\n"); - LOG_INF("| Disk Read Rnd Speed (GB/s) "); + LOG_INF("| Disk Read Rnd Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw); } LOG_INF("\n"); - LOG_INF("| Disk Write Rnd Speed (GB/s) "); + LOG_INF("| Disk Write Rnd Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw); } LOG_INF("\n"); - LOG_INF("| GPU Metal "); + LOG_INF("| GPU Metal "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal); } LOG_INF("\n"); - LOG_INF("| GPU CUDA "); + LOG_INF("| GPU CUDA "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda); } LOG_INF("\n"); - LOG_INF("| GPU Vulkan "); + LOG_INF("| GPU Vulkan "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan); } LOG_INF("\n"); - LOG_INF("| GPU Kompute "); + LOG_INF("| GPU Kompute "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute); } LOG_INF("\n"); - LOG_INF("| GPU BLAS "); + LOG_INF("| GPU BLAS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas); } LOG_INF("\n"); - LOG_INF("| BLAS "); + LOG_INF("| BLAS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas); } LOG_INF("\n"); - 
LOG_INF("| SYCL "); + LOG_INF("| SYCL "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl); } LOG_INF("\n"); - LOG_INF("| GPU Name "); + LOG_INF("| GPU Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name); } LOG_INF("\n"); - LOG_INF("| GPU Description "); + LOG_INF("| GPU Description "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description); } LOG_INF("\n"); - LOG_INF("| GPU Mem Free (GiB) "); + LOG_INF("| GPU Mem Free (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free); } LOG_INF("\n"); - LOG_INF("| GPU Mem Total (GiB) "); + LOG_INF("| GPU Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total); } LOG_INF("\n"); - LOG_INF("| Metal VRAM Read BW (GB/s) "); + LOG_INF("| Metal VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw); } LOG_INF("\n"); - LOG_INF("| Metal KVCache Copy Time(ms/l)"); + LOG_INF("| Metal KVCache Copy Time(ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| Metal flops (F32xF32, GFLOPS)"); + LOG_INF("| Metal flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (F16xF32, GFLOPS)"); + LOG_INF("| Metal flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q2KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q2KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q5KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q5KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q6KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q50xF32, GFLOPS)"); + LOG_INF("| Metal flops (Q50xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q80xF32, GFLOPS)"); + LOG_INF("| Metal flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (IQ1SxF32, GFLOPS)"); + LOG_INF("| Metal flops (IQ1SxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32); } @@ -1966,67 +1966,67 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CUDA VRAM Read BW (GB/s) "); + LOG_INF("| CUDA VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); } LOG_INF("\n"); - LOG_INF("| CUDA KVCache Copy Time (ms/l)"); + LOG_INF("| CUDA KVCache Copy Time (ms/l) "); for (int i 
= 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); + LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); + LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); + LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32); } @@ -2038,39 +2038,39 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Model flops (output F32xF32) "); + LOG_INF("| Model flops (output F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output F16xF32) "); + LOG_INF("| Model flops (output F16xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q2KxF32) "); + LOG_INF("| Model flops (output Q2KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q4KxF32) "); + LOG_INF("| Model flops (output Q4KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q5KxF32) "); + LOG_INF("| Model flops (output Q5KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q6KxF32) "); + LOG_INF("| Model flops (output Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q50xF32) "); + LOG_INF("| Model flops (output Q50xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q80xF32) "); + LOG_INF("| Model flops (output Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32); LOG_INF("\n"); - LOG_INF("| Model 
flops (output IQ1SxF32) "); + LOG_INF("| Model flops (output IQ1SxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32); LOG_INF("\n"); @@ -2078,131 +2078,131 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer F32xF32) "); + LOG_INF("| Model flops (layer F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer F16xF32) "); + LOG_INF("| Model flops (layer F16xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q2KxF32) "); + LOG_INF("| Model flops (layer Q2KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q4KxF32) "); + LOG_INF("| Model flops (layer Q4KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q5KxF32) "); + LOG_INF("| Model flops (layer Q5KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q6KxF32) "); + LOG_INF("| Model flops (layer Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q50xF32) "); + LOG_INF("| Model flops (layer Q50xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q80xF32) "); + LOG_INF("| Model flops (layer Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer IQ1SxF32) "); + LOG_INF("| Model flops (layer IQ1SxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer IQ4NLxF32) "); + LOG_INF("| Model flops (layer IQ4NLxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32); LOG_INF("\n"); - LOG_INF("| Model params (input F32) "); + LOG_INF("| Model params (input F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32); LOG_INF("\n"); - LOG_INF("| Model params (input F16) "); + LOG_INF("| Model params (input F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); LOG_INF("\n"); - LOG_INF("| Model params (input Q2K) "); + LOG_INF("| Model params (input Q2K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k); LOG_INF("\n"); - LOG_INF("| Model params (input Q4K) "); + LOG_INF("| Model params (input Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); LOG_INF("\n"); - LOG_INF("| Model params (input Q5K) "); + LOG_INF("| Model params (input Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k); LOG_INF("\n"); - LOG_INF("| Model params (input Q6K) "); + LOG_INF("| Model params (input Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); LOG_INF("\n"); - LOG_INF("| Model params (input Q50) "); + LOG_INF("| Model params (input Q50) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); LOG_INF("\n"); - LOG_INF("| Model params (input Q80) "); + LOG_INF("| Model params (input Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); LOG_INF("\n"); - LOG_INF("| Model params (input IQ1S) "); + 
LOG_INF("| Model params (input IQ1S) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (input IQ4NL) "); + LOG_INF("| Model params (input IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl); LOG_INF("\n"); - LOG_INF("| Model params (layer F32) "); + LOG_INF("| Model params (layer F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); LOG_INF("\n"); - LOG_INF("| Model params (layer F16) "); + LOG_INF("| Model params (layer F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); LOG_INF("\n"); - LOG_INF("| Model params (layer Q2K) "); + LOG_INF("| Model params (layer Q2K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| Model params (layer Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q5K) "); + LOG_INF("| Model params (layer Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q6K) "); + LOG_INF("| Model params (layer Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q50) "); + LOG_INF("| Model params (layer Q50) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); LOG_INF("\n"); - LOG_INF("| Model params (layer Q80) "); + LOG_INF("| Model params (layer Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); LOG_INF("\n"); - LOG_INF("| Model params (layer IQ1S) "); + LOG_INF("| Model params (layer IQ1S) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (layer IQ4NL) "); + LOG_INF("| Model params (layer IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl); LOG_INF("\n"); - LOG_INF("| Model params (output F32) "); + LOG_INF("| Model params (output F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); LOG_INF("\n"); - LOG_INF("| Model params (output F16) "); + LOG_INF("| Model params (output F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); LOG_INF("\n"); @@ -2210,43 +2210,43 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k); LOG_INF("\n"); - LOG_INF("| Model params (output Q4K) "); + LOG_INF("| Model params (output Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); LOG_INF("\n"); - LOG_INF("| Model params (output Q5K) "); + LOG_INF("| Model params (output Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k); LOG_INF("\n"); - LOG_INF("| Model params (output Q6K) "); + LOG_INF("| Model params (output Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); LOG_INF("\n"); - LOG_INF("| Model params (output Q50) "); + LOG_INF("| Model params (output Q50) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); LOG_INF("\n"); - LOG_INF("| Model params (output Q80) "); + LOG_INF("| Model params (output Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); - LOG_INF("| Model params (output IQ1S) "); + LOG_INF("| Model params (output IQ1S) "); LOG_INF("| %-10" PRId64 " ", 
dev_info_set[0].model_params.output_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (output IQ4NL) "); + LOG_INF("| Model params (output IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl); LOG_INF("\n"); - LOG_INF("| Model bytes (input) "); + LOG_INF("| Model bytes (input) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input); LOG_INF("\n"); - LOG_INF("| Model bytes (layer) "); + LOG_INF("| Model bytes (layer) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer); LOG_INF("\n"); - LOG_INF("| Model bytes (output) "); + LOG_INF("| Model bytes (output) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output); LOG_INF("\n"); diff --git a/src/llama.cpp b/src/llama.cpp index 51ef97c8..2ac20007 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3572,7 +3572,7 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) case GGML_TYPE_Q8_0: return n_params->layer_q80 > 0 || n_params->output_q80 > 0; case GGML_TYPE_IQ1_S: - return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; + return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; case GGML_TYPE_IQ4_NL: return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0; default: From 230c68b80cad34e634077261290b3fe5084f6c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?leeetao=C2=A0?= <3122669219@qq.com> Date: Fri, 7 Mar 2025 07:55:23 +0000 Subject: [PATCH 05/31] fixed the alignment display --- common/profiler.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index 9c00f535..269d5ba3 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -1810,7 +1810,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Disk Write Seq Speed (GB/s) "); + LOG_INF("| Disk Write Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw); } @@ -2206,7 +2206,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); LOG_INF("\n"); - LOG_INF("| Model params (output Q2K) "); + LOG_INF("| Model params (output Q2K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k); LOG_INF("\n"); @@ -2234,7 +2234,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (output IQ4NL) "); + LOG_INF("| Model params (output IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl); LOG_INF("\n"); From 45ec52c2cb896687162e6516068f8abe57dd4092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?leeetao=C2=A0?= <3122669219@qq.com> Date: Fri, 7 Mar 2025 16:56:16 +0000 Subject: [PATCH 06/31] Added support for IQ1_M and IQ2_XXS quantization type --- common/common.cpp | 89 +++++---- common/profiler.cpp | 444 ++++++++++++++++++++++++++++---------------- common/profiler.h | 210 ++++++++++++--------- src/llama.cpp | 148 ++++++++++----- 4 files changed, 555 insertions(+), 336 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 33523467..a37dfeea 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -901,16 +901,18 @@ static bool assign_layers_to_device( float t_read_ram_cpu = 0.0f; float t_calc_cpu = ( - master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / 
(dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + - master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS)+ - master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_iq2xxs_f32/ (dev.cpu_props.flops_iq2xxs_f32* 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms @@ -925,31 +927,35 @@ static bool assign_layers_to_device( if (dev.gpu_support.metal) { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) + - master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) + - master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + + 
master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1m_f32 / (dev.gpu_props.metal_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms } else { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) + - master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) + - master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1m_f32 / (dev.gpu_props.cuda_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms @@ -1125,17 +1131,18 @@ static bool assign_layers_to_device( if (m == 0) { kappa = ( - dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + - dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + - dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms - + dev.model_flops.layer_f32_f32 / 
(dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) + + dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms diff --git a/common/profiler.cpp b/common/profiler.cpp index 269d5ba3..18b345a9 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -374,10 +374,12 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ2_XXS: case GGML_TYPE_Q5_0: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ1_M: matrix_B = malloc((embd_size / ggml_blck_size(src0t) * ggml_type_size(src0t))); // The quantization block sizes are inconsistent for different quantization methods break; default: @@ -1354,41 +1356,47 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c #ifdef GGML_USE_CUDA struct gpu_props gpu = dev_info.gpu_props; - gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.cuda_flops_q2k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.cuda_flops_iq1s_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.cuda_flops_iq4nl_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.cuda_flops_q2k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) 
/ 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.cuda_flops_iq2xxs_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.cuda_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.cuda_flops_iq4nl_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)gpu.cuda_flops_iq1m_f32 + EPS) / 1e9; #elif GGML_USE_METAL struct gpu_props gpu = dev_info.gpu_props; - gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.metal_flops_q2k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.metal_flops_iq1s_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.metal_flops_iq4nl_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.metal_flops_q2k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.metal_flops_iq2xxs_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.metal_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.metal_flops_iq4nl_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)gpu.metal_flops_iq1m_f32 + EPS) / 1e9; #endif - cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / 
((double)cpu.flops_q5k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)cpu.flops_iq1m_f32 + EPS) / 1e9; double total_latency = 0.0f; #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) @@ -1400,16 +1408,18 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c total_latency += cpu_latency_per_layer * n_layers; #endif - total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq2xxs_f32 
/ ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq1m_f32 / ((double)cpu.flops_iq1m_f32 + EPS) / 1e9; total_latency *= 1000; // convert to ms @@ -1665,588 +1675,664 @@ static float device_mem_copy_delay(struct device_info & dev_info, struct llama_m void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) { LOG_INF("\n-------------------------------------------------------------------------------------------\n"); - LOG_INF("| Property "); + LOG_INF("| Property "); for (int i = 0; i < n; ++i) { LOG_INF("| Rank %-8d", i); GGML_ASSERT((int)dev_info_set[i].rank == i); } LOG_INF("\n-------------------------------------------------------------------------------------------\n"); - LOG_INF("| Device Name "); + LOG_INF("| Device Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].device_name); } LOG_INF("\n"); - LOG_INF("| Device OS "); + LOG_INF("| Device OS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].device_os); } LOG_INF("\n"); - LOG_INF("| CPU Name "); + LOG_INF("| CPU Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name); } LOG_INF("\n"); - LOG_INF("| CPU Description "); + LOG_INF("| CPU Description "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description); } LOG_INF("\n"); - LOG_INF("| Number of CPU cores "); + LOG_INF("| Number of CPU cores "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores); } LOG_INF("\n"); - LOG_INF("| CPU flops (F32xF32, GFLOPS) "); + LOG_INF("| CPU flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (F16xF32, GFLOPS) "); + LOG_INF("| CPU flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q2K x F32, GFLOPS) "); + LOG_INF("| CPU flops (Q2K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q4K x F32, GFLOPS) "); + LOG_INF("| CPU flops (Q4K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q5K x F32, GFLOPS) "); + LOG_INF("| CPU flops (Q5K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q6K x F32, GFLOPS) "); + LOG_INF("| CPU flops (Q6K x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q50 x F32, GFLOPS) "); + LOG_INF("| CPU flops (IQ2XXS x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq2xxs_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (Q50 x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", 
dev_info_set[i].cpu_props.flops_q50_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q80 x F32, GFLOPS) "); + LOG_INF("| CPU flops (Q80 x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (IQ1S x F32, GFLOPS) "); + LOG_INF("| CPU flops (IQ1S x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS)"); + LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32); } LOG_INF("\n"); - LOG_INF("| Physical Mem Total (GiB) "); + LOG_INF("| CPU flops (IQ1M x F32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1m_f32); + } + LOG_INF("\n"); + + LOG_INF("| Physical Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); } LOG_INF("\n"); - LOG_INF("| Physical Mem Available (GiB) "); + LOG_INF("| Physical Mem Available (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical); } LOG_INF("\n"); - LOG_INF("| Used Mem Swappable (GiB) "); + LOG_INF("| Used Mem Swappable (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.used_can_swap); } LOG_INF("\n"); - LOG_INF("| Swap Mem Total (GiB) "); + LOG_INF("| Swap Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap); } LOG_INF("\n"); - LOG_INF("| Swap Mem Available (GiB) "); + LOG_INF("| Swap Mem Available (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap); } LOG_INF("\n"); - LOG_INF("| CPU RAM Read BW (GB/s) "); + LOG_INF("| CPU RAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw); } LOG_INF("\n"); - LOG_INF("| CPU KVCache Copy Time (ms/l) "); + LOG_INF("| CPU KVCache Copy Time (ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| Disk Read Seq Speed (GB/s) "); + LOG_INF("| Disk Read Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw); } LOG_INF("\n"); - LOG_INF("| Disk Write Seq Speed (GB/s) "); + LOG_INF("| Disk Write Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw); } LOG_INF("\n"); - LOG_INF("| Disk Read Rnd Speed (GB/s) "); + LOG_INF("| Disk Read Rnd Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw); } LOG_INF("\n"); - LOG_INF("| Disk Write Rnd Speed (GB/s) "); + LOG_INF("| Disk Write Rnd Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw); } LOG_INF("\n"); - LOG_INF("| GPU Metal "); + LOG_INF("| GPU Metal "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal); } LOG_INF("\n"); - LOG_INF("| GPU CUDA "); + LOG_INF("| GPU CUDA "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda); } LOG_INF("\n"); - LOG_INF("| GPU Vulkan "); + LOG_INF("| GPU Vulkan "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan); } LOG_INF("\n"); - LOG_INF("| GPU Kompute "); + LOG_INF("| GPU Kompute "); for (int i = 0; i < n; ++i) { LOG_INF("| 
%-10d ", dev_info_set[i].gpu_support.kompute); } LOG_INF("\n"); - LOG_INF("| GPU BLAS "); + LOG_INF("| GPU BLAS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas); } LOG_INF("\n"); - LOG_INF("| BLAS "); + LOG_INF("| BLAS "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas); } LOG_INF("\n"); - LOG_INF("| SYCL "); + LOG_INF("| SYCL "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl); } LOG_INF("\n"); - LOG_INF("| GPU Name "); + LOG_INF("| GPU Name "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name); } LOG_INF("\n"); - LOG_INF("| GPU Description "); + LOG_INF("| GPU Description "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description); } LOG_INF("\n"); - LOG_INF("| GPU Mem Free (GiB) "); + LOG_INF("| GPU Mem Free (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free); } LOG_INF("\n"); - LOG_INF("| GPU Mem Total (GiB) "); + LOG_INF("| GPU Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total); } LOG_INF("\n"); - LOG_INF("| Metal VRAM Read BW (GB/s) "); + LOG_INF("| Metal VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw); } LOG_INF("\n"); - LOG_INF("| Metal KVCache Copy Time(ms/l) "); + LOG_INF("| Metal KVCache Copy Time(ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| Metal flops (F32xF32, GFLOPS) "); + LOG_INF("| Metal flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (F16xF32, GFLOPS) "); + LOG_INF("| Metal flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q2KxF32, GFLOPS) "); + LOG_INF("| Metal flops (Q2KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q4KxF32, GFLOPS) "); + LOG_INF("| Metal flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q5KxF32, GFLOPS) "); + LOG_INF("| Metal flops (Q5KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q6KxF32, GFLOPS) "); + LOG_INF("| Metal flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q50xF32, GFLOPS) "); + LOG_INF("| Metal flops (IQ2XXSxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq2xxs_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (Q50xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q80xF32, GFLOPS) "); + LOG_INF("| Metal flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (IQ1SxF32, GFLOPS) "); + LOG_INF("| 
Metal flops (IQ1SxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS)"); + LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32); } LOG_INF("\n"); - LOG_INF("| CUDA VRAM Read BW (GB/s) "); + LOG_INF("| Metal flops (IQ1MxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1m_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); } LOG_INF("\n"); - LOG_INF("| CUDA KVCache Copy Time (ms/l) "); + LOG_INF("| CUDA KVCache Copy Time (ms/l) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); + LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); + LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + LOG_INF("| CUDA flops (IQ2XXSxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq2xxs_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); + LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) "); + LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32); } LOG_INF("\n"); - LOG_INF("| Model flops (output F32xF32) "); + LOG_INF("| CUDA flops (IQ1MxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1m_f32); + } + LOG_INF("\n"); + + LOG_INF("| Model flops (output F32xF32) "); LOG_INF("| %-10" PRId64 " ", 
dev_info_set[0].model_flops.output_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output F16xF32) "); + LOG_INF("| Model flops (output F16xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q2KxF32) "); + LOG_INF("| Model flops (output Q2KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q4KxF32) "); + LOG_INF("| Model flops (output Q4KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q5KxF32) "); + LOG_INF("| Model flops (output Q5KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q6KxF32) "); + LOG_INF("| Model flops (output Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); + + LOG_INF("| Model flops (output IQ2XXSxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq2xxs_f32); + LOG_INF("\n"); - LOG_INF("| Model flops (output Q50xF32) "); + LOG_INF("| Model flops (output Q50xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q80xF32) "); + LOG_INF("| Model flops (output Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output IQ1SxF32) "); + LOG_INF("| Model flops (output IQ1SxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output IQ4NLxF32) "); + LOG_INF("| Model flops (output IQ4NLxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer F32xF32) "); + LOG_INF("| Model flops (output IQ1MxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1m_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer F16xF32) "); + LOG_INF("| Model flops (layer F16xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q2KxF32) "); + LOG_INF("| Model flops (layer Q2KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q4KxF32) "); + LOG_INF("| Model flops (layer Q4KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q5KxF32) "); + LOG_INF("| Model flops (layer Q5KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q6KxF32) "); + LOG_INF("| Model flops (layer Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q50xF32) "); + LOG_INF("| Model flops (layer IQ2XXSxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq2xxs_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer Q50xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q80xF32) "); + LOG_INF("| Model flops (layer Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32); 
LOG_INF("\n"); - LOG_INF("| Model flops (layer IQ1SxF32) "); + LOG_INF("| Model flops (layer IQ1SxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer IQ4NLxF32) "); + LOG_INF("| Model flops (layer IQ4NLxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32); LOG_INF("\n"); - LOG_INF("| Model params (input F32) "); + LOG_INF("| Model flops (layer IQ1MxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1m_f32); + LOG_INF("\n"); + + LOG_INF("| Model params (input F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32); LOG_INF("\n"); - LOG_INF("| Model params (input F16) "); + LOG_INF("| Model params (input F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); LOG_INF("\n"); - LOG_INF("| Model params (input Q2K) "); + LOG_INF("| Model params (input Q2K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k); LOG_INF("\n"); - LOG_INF("| Model params (input Q4K) "); + LOG_INF("| Model params (input Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); LOG_INF("\n"); - LOG_INF("| Model params (input Q5K) "); + LOG_INF("| Model params (input Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k); LOG_INF("\n"); - LOG_INF("| Model params (input Q6K) "); + LOG_INF("| Model params (input Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); LOG_INF("\n"); - LOG_INF("| Model params (input Q50) "); + LOG_INF("| Model params (input IQ2XXS) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq2xxs); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q50) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); LOG_INF("\n"); - LOG_INF("| Model params (input Q80) "); + LOG_INF("| Model params (input Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); LOG_INF("\n"); - LOG_INF("| Model params (input IQ1S) "); + LOG_INF("| Model params (input IQ1S) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (input IQ4NL) "); + LOG_INF("| Model params (input IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl); LOG_INF("\n"); - LOG_INF("| Model params (layer F32) "); + LOG_INF("| Model params (input IQ1M) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1m); + LOG_INF("\n"); + + LOG_INF("| Model params (layer F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); LOG_INF("\n"); - LOG_INF("| Model params (layer F16) "); + LOG_INF("| Model params (layer F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); LOG_INF("\n"); - LOG_INF("| Model params (layer Q2K) "); + LOG_INF("| Model params (layer Q2K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| Model params (layer Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q5K) "); + LOG_INF("| Model params (layer Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q6K) "); + LOG_INF("| Model params (layer Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q50) "); + 
LOG_INF("| Model params (layer IQ2XXS) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq2xxs); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q50) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); LOG_INF("\n"); - LOG_INF("| Model params (layer Q80) "); + LOG_INF("| Model params (layer Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); LOG_INF("\n"); - LOG_INF("| Model params (layer IQ1S) "); + LOG_INF("| Model params (layer IQ1S) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (layer IQ4NL) "); + LOG_INF("| Model params (layer IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl); - LOG_INF("\n"); + LOG_INF("\n"); + + LOG_INF("| Model params (layer IQ1M) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1m); + LOG_INF("\n"); - LOG_INF("| Model params (output F32) "); + LOG_INF("| Model params (output F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); LOG_INF("\n"); - LOG_INF("| Model params (output F16) "); + LOG_INF("| Model params (output F16) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); LOG_INF("\n"); - LOG_INF("| Model params (output Q2K) "); + LOG_INF("| Model params (output Q2K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k); LOG_INF("\n"); - LOG_INF("| Model params (output Q4K) "); + LOG_INF("| Model params (output Q4K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); LOG_INF("\n"); - LOG_INF("| Model params (output Q5K) "); + LOG_INF("| Model params (output Q5K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k); LOG_INF("\n"); - LOG_INF("| Model params (output Q6K) "); + LOG_INF("| Model params (output Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); LOG_INF("\n"); - LOG_INF("| Model params (output Q50) "); + LOG_INF("| Model params (output IQ2XXS) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq2xxs); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q50) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); LOG_INF("\n"); - LOG_INF("| Model params (output Q80) "); + LOG_INF("| Model params (output Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); - LOG_INF("| Model params (output IQ1S) "); + LOG_INF("| Model params (output IQ1S) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s); LOG_INF("\n"); - LOG_INF("| Model params (output IQ4NL) "); + LOG_INF("| Model params (output IQ4NL) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl); LOG_INF("\n"); - LOG_INF("| Model bytes (input) "); + LOG_INF("| Model params (output IQ1M) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1m); + LOG_INF("\n"); + + LOG_INF("| Model bytes (input) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input); LOG_INF("\n"); - LOG_INF("| Model bytes (layer) "); + LOG_INF("| Model bytes (layer) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer); LOG_INF("\n"); - LOG_INF("| Model bytes (output) "); + LOG_INF("| Model bytes (output) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output); LOG_INF("\n"); @@ -2287,26 +2373,32 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(struct 
disk_props) + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 10 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, + + sizeof(float) * 12 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32 + // - cpu_props.flops_iq2xxs_f32 // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32 // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32 + // - cpu_props.flops_iq1m_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 26; // GPU attributes + + sizeof(float) * 30; // GPU attributes // memory: // - memory_free, memory_total // - metal_read_vram_bw, cuda_read_vram_bw // Metal floating-point performance: // - metal_flops_f32_f32, metal_flops_f16_f32 // - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32 + // - metal_flops_iq2xxs_f32 // - metal_flops_q50_f32, metal_flops_q80_f32 // - metal_flops_iq1s_f32, metal_flops_iq4nl_f32 + // - metal_flops_iq1m_f32 // CUDA floating-point performance: // - cuda_flops_f32_f32, cuda_flops_f16_f32 // - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32 + // - cuda_flops_iq2xxs_f32 // - cuda_flops_q50_f32, cuda_flops_q80_f32 // - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32 + // - cuda_flops_iq1m_f32 // delay: // - metal_mem_cpy_delay, cuda_mem_cpy_delay @@ -2379,6 +2471,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_iq2xxs_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); ptr += sizeof(float); @@ -2391,6 +2486,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_iq1m_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2424,6 +2522,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq2xxs_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); ptr += sizeof(float); @@ -2436,6 +2537,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1m_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float)); ptr += sizeof(float); @@ -2460,6 +2564,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq2xxs_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); ptr += sizeof(float); @@ -2472,6 +2579,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1m_f32, sizeof(float)); + ptr += sizeof(float); + 
memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float)); // no need to synchronize model flops and model params @@ -2558,6 +2668,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_iq2xxs_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); ptr += sizeof(float); @@ -2570,6 +2683,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_iq1m_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2603,6 +2719,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_iq2xxs_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); ptr += sizeof(float); @@ -2615,6 +2734,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_iq1m_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float)); ptr += sizeof(float); @@ -2639,6 +2761,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_iq2xxs_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); ptr += sizeof(float); @@ -2651,6 +2776,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_iq1m_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float)); // no need to synchronize model flops and model params diff --git a/common/profiler.h b/common/profiler.h index 0681a711..b8fff0d1 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -15,16 +15,18 @@ struct cpu_props { const char * name; const char * description; uint32_t cores; - float flops_f32_f32; // in GFLOPS - float flops_f16_f32; // in GFLOPS - float flops_q2k_f32; // in GFLOPS - float flops_q4k_f32; // in GFLOPS - float flops_q5k_f32; // in GFLOPS - float flops_q6k_f32; // in GFLOPS - float flops_q50_f32; // in GFLOPS - float flops_q80_f32; // in GFLOPS - float flops_iq1s_f32; // in GFLOPS - float flops_iq4nl_f32; // in GFLOPS + float flops_f32_f32; // in GFLOPS + float flops_f16_f32; // in GFLOPS + float flops_q2k_f32; // in GFLOPS + float flops_q4k_f32; // in GFLOPS + float flops_q5k_f32; // in GFLOPS + float flops_q6k_f32; // in GFLOPS + float flops_iq2xxs_f32; // in GFLOPS + float flops_q50_f32; // in GFLOPS + float flops_q80_f32; // in GFLOPS + float flops_iq1s_f32; // in GFLOPS + float flops_iq4nl_f32; // in GFLOPS + float flops_iq1m_f32; // in GFLOPS cpu_props() : name (""), @@ -36,10 +38,12 @@ struct cpu_props { flops_q4k_f32 (0.0f), flops_q5k_f32 (0.0f), flops_q6k_f32 (0.0f), 
+ flops_iq2xxs_f32(0.0f), flops_q50_f32 (0.0f), flops_q80_f32 (0.0f), flops_iq1s_f32 (0.0f), - flops_iq4nl_f32 (0.0f) + flops_iq4nl_f32 (0.0f), + flops_iq1m_f32 (0.0f) {} }; @@ -84,32 +88,36 @@ struct gpu_support { struct gpu_props { const char * name; const char * description; - float memory_free; // in GiB - float memory_total; // in GiB - float metal_read_vram_bw; // in GB/s - float metal_flops_f32_f32; // in GFLOPS - float metal_flops_f16_f32; // in GFLOPS - float metal_flops_q2k_f32; // in GFLOPS - float metal_flops_q4k_f32; // in GFLOPS - float metal_flops_q5k_f32; // in GFLOPS - float metal_flops_q6k_f32; // in GFLOPS - float metal_flops_q50_f32; // in GFLOPS - float metal_flops_q80_f32; // in GFLOPS - float metal_flops_iq1s_f32; // in GFLOPS - float metal_flops_iq4nl_f32; // in GFLOPS - float metal_mem_cpy_delay; // in ms - float cuda_read_vram_bw; // in GB/s - float cuda_flops_f32_f32; // in GFLOPS - float cuda_flops_f16_f32; // in GFLOPS - float cuda_flops_q2k_f32; // in GFLOPS - float cuda_flops_q4k_f32; // in GFLOPS - float cuda_flops_q5k_f32; // in GFLOPS - float cuda_flops_q6k_f32; // in GFLOPS - float cuda_flops_q50_f32; // in GFLOPS - float cuda_flops_q80_f32; // in GFLOPS - float cuda_flops_iq1s_f32; // in GFLOPS - float cuda_flops_iq4nl_f32; // in GFLOPS - float cuda_mem_cpy_delay; // in ms + float memory_free; // in GiB + float memory_total; // in GiB + float metal_read_vram_bw; // in GB/s + float metal_flops_f32_f32; // in GFLOPS + float metal_flops_f16_f32; // in GFLOPS + float metal_flops_q2k_f32; // in GFLOPS + float metal_flops_q4k_f32; // in GFLOPS + float metal_flops_q5k_f32; // in GFLOPS + float metal_flops_q6k_f32; // in GFLOPS + float metal_flops_iq2xxs_f32; // in GFLOPS + float metal_flops_q50_f32; // in GFLOPS + float metal_flops_q80_f32; // in GFLOPS + float metal_flops_iq1s_f32; // in GFLOPS + float metal_flops_iq4nl_f32; // in GFLOPS + float metal_flops_iq1m_f32; // in GFLOPS + float metal_mem_cpy_delay; // in ms + float cuda_read_vram_bw; // in GB/s + float cuda_flops_f32_f32; // in GFLOPS + float cuda_flops_f16_f32; // in GFLOPS + float cuda_flops_q2k_f32; // in GFLOPS + float cuda_flops_q4k_f32; // in GFLOPS + float cuda_flops_q5k_f32; // in GFLOPS + float cuda_flops_q6k_f32; // in GFLOPS + float cuda_flops_iq2xxs_f32; // in GFLOPS + float cuda_flops_q50_f32; // in GFLOPS + float cuda_flops_q80_f32; // in GFLOPS + float cuda_flops_iq1s_f32; // in GFLOPS + float cuda_flops_iq4nl_f32; // in GFLOPS + float cuda_flops_iq1m_f32; // in GFLOPS + float cuda_mem_cpy_delay; // in ms gpu_props() : name (""), @@ -123,10 +131,12 @@ struct gpu_props { metal_flops_q4k_f32 (0.0f), metal_flops_q5k_f32 (0.0f), metal_flops_q6k_f32 (0.0f), + metal_flops_iq2xxs_f32 (0.0f), metal_flops_q50_f32 (0.0f), metal_flops_q80_f32 (0.0f), metal_flops_iq1s_f32 (0.0f), metal_flops_iq4nl_f32 (0.0f), + metal_flops_iq1m_f32 (0.0f), metal_mem_cpy_delay (0.0f), cuda_read_vram_bw (0.0f), cuda_flops_f32_f32 (0.0f), @@ -135,10 +145,12 @@ struct gpu_props { cuda_flops_q4k_f32 (0.0f), cuda_flops_q5k_f32 (0.0f), cuda_flops_q6k_f32 (0.0f), + cuda_flops_iq2xxs_f32 (0.0f), cuda_flops_q50_f32 (0.0f), cuda_flops_q80_f32 (0.0f), cuda_flops_iq1s_f32 (0.0f), cuda_flops_iq4nl_f32 (0.0f), + cuda_flops_iq1m_f32 (0.0f), cuda_mem_cpy_delay (0.0f) {} }; @@ -150,43 +162,52 @@ struct model_flops { int64_t output_q4k_f32; int64_t output_q5k_f32; int64_t output_q6k_f32; + int64_t output_iq2xxs_f32; int64_t output_q50_f32; int64_t output_q80_f32; int64_t output_iq1s_f32; int64_t output_iq4nl_f32; + int64_t 
output_iq1m_f32; int64_t layer_f32_f32; int64_t layer_f16_f32; int64_t layer_q2k_f32; int64_t layer_q4k_f32; int64_t layer_q5k_f32; int64_t layer_q6k_f32; + int64_t layer_iq2xxs_f32; int64_t layer_q50_f32; int64_t layer_q80_f32; int64_t layer_iq1s_f32; int64_t layer_iq4nl_f32; + int64_t layer_iq1m_f32; model_flops() : - inp_embd_ms(0.0f), - output_f32_f32(0), - output_f16_f32(0), - output_q2k_f32(0), - output_q4k_f32(0), - output_q5k_f32(0), - output_q6k_f32(0), - output_q50_f32(0), - output_q80_f32(0), - output_iq1s_f32(0), - output_iq4nl_f32(0), - layer_f32_f32 (0), - layer_f16_f32 (0), - layer_q2k_f32 (0), - layer_q4k_f32 (0), - layer_q5k_f32 (0), - layer_q6k_f32 (0), - layer_q50_f32 (0), - layer_q80_f32 (0), - layer_iq1s_f32 (0), - layer_iq4nl_f32 (0) {} + inp_embd_ms (0.0f), + output_f32_f32 (0), + output_f16_f32 (0), + output_q2k_f32 (0), + output_q4k_f32 (0), + output_q5k_f32 (0), + output_q6k_f32 (0), + output_iq2xxs_f32 (0), + output_q50_f32 (0), + output_q80_f32 (0), + output_iq1s_f32 (0), + output_iq4nl_f32 (0), + output_iq1m_f32 (0), + layer_f32_f32 (0), + layer_f16_f32 (0), + layer_q2k_f32 (0), + layer_q4k_f32 (0), + layer_q5k_f32 (0), + layer_q6k_f32 (0), + layer_iq2xxs_f32 (0), + layer_q50_f32 (0), + layer_q80_f32 (0), + layer_iq1s_f32 (0), + layer_iq4nl_f32 (0), + layer_iq1m_f32 (0) + {} }; struct model_params { @@ -196,62 +217,75 @@ struct model_params { int64_t input_q4k; int64_t input_q5k; int64_t input_q6k; + int64_t input_iq2xxs; int64_t input_q50; int64_t input_q80; int64_t input_iq1s; int64_t input_iq4nl; + int64_t input_iq1m; int64_t output_f32; int64_t output_f16; int64_t output_q2k; int64_t output_q4k; int64_t output_q5k; int64_t output_q6k; + int64_t output_iq2xxs; int64_t output_q50; int64_t output_q80; int64_t output_iq1s; int64_t output_iq4nl; + int64_t output_iq1m; int64_t layer_f32; int64_t layer_f16; int64_t layer_q2k; int64_t layer_q4k; int64_t layer_q5k; int64_t layer_q6k; + int64_t layer_iq2xxs; int64_t layer_q50; int64_t layer_q80; int64_t layer_iq1s; int64_t layer_iq4nl; + int64_t layer_iq1m; model_params() : - input_f32 (0), - input_f16 (0), - input_q2k (0), - input_q4k (0), - input_q5k (0), - input_q6k (0), - input_q50 (0), - input_q80 (0), - input_iq1s(0), - input_iq4nl(0), - output_f32(0), - output_f16(0), - output_q2k(0), - output_q4k(0), - output_q5k(0), - output_q6k(0), - output_q50(0), - output_q80(0), - output_iq1s(0), - output_iq4nl(0), - layer_f32 (0), - layer_f16 (0), - layer_q2k (0), - layer_q4k (0), - layer_q5k (0), - layer_q6k (0), - layer_q50 (0), - layer_q80 (0), - layer_iq1s (0), - layer_iq4nl (0) {} + input_f32 (0), + input_f16 (0), + input_q2k (0), + input_q4k (0), + input_q5k (0), + input_q6k (0), + input_iq2xxs (0), + input_q50 (0), + input_q80 (0), + input_iq1s (0), + input_iq4nl (0), + input_iq1m (0), + output_f32 (0), + output_f16 (0), + output_q2k (0), + output_q4k (0), + output_q5k (0), + output_q6k (0), + output_iq2xxs (0), + output_q50 (0), + output_q80 (0), + output_iq1s (0), + output_iq4nl (0), + output_iq1m (0), + layer_f32 (0), + layer_f16 (0), + layer_q2k (0), + layer_q4k (0), + layer_q5k (0), + layer_q6k (0), + layer_iq2xxs (0), + layer_q50 (0), + layer_q80 (0), + layer_iq1s (0), + layer_iq4nl (0), + layer_iq1m (0) + {} }; struct model_bytes { diff --git a/src/llama.cpp b/src/llama.cpp index 2ac20007..95cbbaf9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3560,21 +3560,25 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) case GGML_TYPE_F16: return true; case GGML_TYPE_Q2_K: - 
return n_params->layer_q2k > 0 || n_params->output_q2k > 0; + return n_params->layer_q2k > 0 || n_params->output_q2k > 0; case GGML_TYPE_Q4_K: - return n_params->layer_q4k > 0 || n_params->output_q4k > 0; + return n_params->layer_q4k > 0 || n_params->output_q4k > 0; case GGML_TYPE_Q5_K: - return n_params->layer_q5k > 0 || n_params->output_q5k > 0; + return n_params->layer_q5k > 0 || n_params->output_q5k > 0; case GGML_TYPE_Q6_K: - return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + case GGML_TYPE_IQ2_XXS: + return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0; case GGML_TYPE_Q5_0: - return n_params->layer_q50 > 0 || n_params->output_q50 > 0; + return n_params->layer_q50 > 0 || n_params->output_q50 > 0; case GGML_TYPE_Q8_0: - return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + return n_params->layer_q80 > 0 || n_params->output_q80 > 0; case GGML_TYPE_IQ1_S: - return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; + return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; case GGML_TYPE_IQ4_NL: - return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0; + return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0; + case GGML_TYPE_IQ1_M: + return n_params->layer_iq1m > 0 || n_params->output_iq1m > 0; default: throw std::runtime_error("Unrecognized data type\n"); } @@ -3679,6 +3683,12 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) { + dev_info->cpu_props.flops_iq2xxs_f32 = device_cpu_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq2xxs_f32= device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq2xxs_f32 = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32); + } + if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); @@ -3703,6 +3713,12 @@ void llama_profile_device( dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) { + dev_info->cpu_props.flops_iq1m_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1m_f32= device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); + } } ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) { @@ -21049,34 +21065,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en case PROFILER_LAYER_OUTPUT: switch (dtype) { case GGML_TYPE_F32: - n_flops->output_f32_f32 += n; + n_flops->output_f32_f32 += n; break; case GGML_TYPE_F16: - n_flops->output_f16_f32 += n; + n_flops->output_f16_f32 += n; break; case GGML_TYPE_Q2_K: - n_flops->output_q2k_f32 += n; + n_flops->output_q2k_f32 += n; break; case GGML_TYPE_Q4_K: - n_flops->output_q4k_f32 += n; + n_flops->output_q4k_f32 += n; break; case GGML_TYPE_Q5_K: - n_flops->output_q5k_f32 += n; + n_flops->output_q5k_f32 += n; break; case GGML_TYPE_Q6_K: - 
n_flops->output_q6k_f32 += n; + n_flops->output_q6k_f32 += n; + break; + case GGML_TYPE_IQ2_XXS: + n_flops->output_iq2xxs_f32 += n; break; case GGML_TYPE_Q5_0: - n_flops->output_q50_f32 += n; + n_flops->output_q50_f32 += n; break; case GGML_TYPE_Q8_0: - n_flops->output_q80_f32 += n; + n_flops->output_q80_f32 += n; break; case GGML_TYPE_IQ1_S: - n_flops->output_iq1s_f32 += n; + n_flops->output_iq1s_f32 += n; break; case GGML_TYPE_IQ4_NL: - n_flops->output_iq4nl_f32 += n; + n_flops->output_iq4nl_f32 += n; + break; + case GGML_TYPE_IQ1_M: + n_flops->output_iq1m_f32 += n; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); @@ -21086,34 +21108,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en case PROFILER_LAYER_BACKEND: switch (dtype) { case GGML_TYPE_F32: - n_flops->layer_f32_f32 += n; + n_flops->layer_f32_f32 += n; break; case GGML_TYPE_F16: - n_flops->layer_f16_f32 += n; + n_flops->layer_f16_f32 += n; break; case GGML_TYPE_Q2_K: - n_flops->layer_q2k_f32 += n; + n_flops->layer_q2k_f32 += n; break; case GGML_TYPE_Q4_K: - n_flops->layer_q4k_f32 += n; + n_flops->layer_q4k_f32 += n; break; case GGML_TYPE_Q5_K: - n_flops->layer_q5k_f32 += n; + n_flops->layer_q5k_f32 += n; break; case GGML_TYPE_Q6_K: - n_flops->layer_q6k_f32 += n; + n_flops->layer_q6k_f32 += n; + break; + case GGML_TYPE_IQ2_XXS: + n_flops->layer_iq2xxs_f32 += n; break; case GGML_TYPE_Q5_0: - n_flops->layer_q50_f32 += n; + n_flops->layer_q50_f32 += n; break; case GGML_TYPE_Q8_0: - n_flops->layer_q80_f32 += n; + n_flops->layer_q80_f32 += n; break; case GGML_TYPE_IQ1_S: - n_flops->layer_iq1s_f32 += n; + n_flops->layer_iq1s_f32 += n; break; case GGML_TYPE_IQ4_NL: - n_flops->layer_iq4nl_f32 += n; + n_flops->layer_iq4nl_f32 += n; + break; + case GGML_TYPE_IQ1_M: + n_flops->layer_iq1m_f32 += n; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n"); @@ -21131,34 +21159,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_INPUT: switch (dtype) { case GGML_TYPE_F32: - n_params->input_f32 += n_i64t; + n_params->input_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->input_f16 += n_i64t; + n_params->input_f16 += n_i64t; break; case GGML_TYPE_Q2_K: - n_params->input_q2k += n_i64t; + n_params->input_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->input_q4k += n_i64t; + n_params->input_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->input_q5k += n_i64t; + n_params->input_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->input_q6k += n_i64t; + n_params->input_q6k += n_i64t; + break; + case GGML_TYPE_IQ2_XXS: + n_params->input_iq2xxs += n_i64t; break; case GGML_TYPE_Q5_0: - n_params->input_q50 += n_i64t; + n_params->input_q50 += n_i64t; break; case GGML_TYPE_Q8_0: - n_params->input_q80 += n_i64t; + n_params->input_q80 += n_i64t; break; case GGML_TYPE_IQ1_S: - n_params->input_iq1s += n_i64t; + n_params->input_iq1s += n_i64t; break; case GGML_TYPE_IQ4_NL: - n_params->input_iq4nl += n_i64t; + n_params->input_iq4nl += n_i64t; + break; + case GGML_TYPE_IQ1_M: + n_params->input_iq1m += n_i64t; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); @@ -21185,6 +21219,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case GGML_TYPE_Q6_K: n_params->output_q6k += n_i64t; break; + case GGML_TYPE_IQ2_XXS: + n_params->output_iq2xxs += n_i64t; + break; case GGML_TYPE_Q5_0: 
            n_params->output_q50 += n_i64t;
            break;
@@ -21197,6 +21234,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case GGML_TYPE_IQ4_NL:
             n_params->output_iq4nl += n_i64t;
             break;
+        case GGML_TYPE_IQ1_M:
+            n_params->output_iq1m += n_i64t;
+            break;
         default:
             throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
     }
@@ -21222,6 +21262,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case GGML_TYPE_Q6_K:
             n_params->layer_q6k += n_i64t;
             break;
+        case GGML_TYPE_IQ2_XXS:
+            n_params->layer_iq2xxs += n_i64t;
+            break;
         case GGML_TYPE_Q5_0:
             n_params->layer_q50 += n_i64t;
             break;
@@ -21234,6 +21277,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case GGML_TYPE_IQ4_NL:
             n_params->layer_iq4nl += n_i64t;
             break;
+        case GGML_TYPE_IQ1_M:
+            n_params->layer_iq1m += n_i64t;
+            break;
         default:
             throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
     }
@@ -21522,27 +21568,31 @@ void llama_model_n_flops(
     }
 
     // use average values instead of total values
-    n_flops->layer_f32_f32   = static_cast<int64_t>((double)n_flops->layer_f32_f32   / (double)n_layer);
-    n_flops->layer_f16_f32   = static_cast<int64_t>((double)n_flops->layer_f16_f32   / (double)n_layer);
-    n_flops->layer_q2k_f32   = static_cast<int64_t>((double)n_flops->layer_q2k_f32   / (double)n_layer);
-    n_flops->layer_q4k_f32   = static_cast<int64_t>((double)n_flops->layer_q4k_f32   / (double)n_layer);
-    n_flops->layer_q5k_f32   = static_cast<int64_t>((double)n_flops->layer_q5k_f32   / (double)n_layer);
-    n_flops->layer_q6k_f32   = static_cast<int64_t>((double)n_flops->layer_q6k_f32   / (double)n_layer);
-    n_flops->layer_q50_f32   = static_cast<int64_t>((double)n_flops->layer_q50_f32   / (double)n_layer);
-    n_flops->layer_q80_f32   = static_cast<int64_t>((double)n_flops->layer_q80_f32   / (double)n_layer);
-    n_flops->layer_iq1s_f32  = static_cast<int64_t>((double)n_flops->layer_iq1s_f32  / (double)n_layer);
-    n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
+    n_flops->layer_f32_f32    = static_cast<int64_t>((double)n_flops->layer_f32_f32    / (double)n_layer);
+    n_flops->layer_f16_f32    = static_cast<int64_t>((double)n_flops->layer_f16_f32    / (double)n_layer);
+    n_flops->layer_q2k_f32    = static_cast<int64_t>((double)n_flops->layer_q2k_f32    / (double)n_layer);
+    n_flops->layer_q4k_f32    = static_cast<int64_t>((double)n_flops->layer_q4k_f32    / (double)n_layer);
+    n_flops->layer_q5k_f32    = static_cast<int64_t>((double)n_flops->layer_q5k_f32    / (double)n_layer);
+    n_flops->layer_q6k_f32    = static_cast<int64_t>((double)n_flops->layer_q6k_f32    / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
+    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
+    n_flops->layer_q80_f32    = static_cast<int64_t>((double)n_flops->layer_q80_f32    / (double)n_layer);
+    n_flops->layer_iq1s_f32   = static_cast<int64_t>((double)n_flops->layer_iq1s_f32   / (double)n_layer);
+    n_flops->layer_iq4nl_f32  = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32  / (double)n_layer);
+    n_flops->layer_iq1m_f32   = static_cast<int64_t>((double)n_flops->layer_iq1m_f32   / (double)n_layer);
 
     n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
     n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
     n_params->layer_q2k    = static_cast<int64_t>((double)n_params->layer_q2k    / (double)n_layer);
     n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
-    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
     n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
     n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
+    n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
+    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
     n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
     n_params->layer_iq1s   = static_cast<int64_t>((double)n_params->layer_iq1s   / (double)n_layer);
     n_params->layer_iq4nl  = static_cast<int64_t>((double)n_params->layer_iq4nl  / (double)n_layer);
+    n_params->layer_iq1m   = static_cast<int64_t>((double)n_params->layer_iq1m   / (double)n_layer);
 
     n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);

From b212d74dc3c5f9e012e9d2689418dd8d31f4f982 Mon Sep 17 00:00:00 2001
From: leeetao <3122669219@qq.com>
Date: Thu, 17 Apr 2025 09:17:11 +0000
Subject: [PATCH 07/31] update Readme.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1a9eb733..12735e98 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ Here are the models we have tested so far. You can also try more on Hugging Face
 - **DeepSeek R1-8B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF)
 - **DeepSeek R1-14B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF)
 - **DeepSeek R1-32B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF)
-- **DeepSeek R1-70B (Q4K, Q6K, Q80):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)
+- **DeepSeek R1-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF)):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)
 
 ## ⚙️ How to Use?
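The device-info synchronization patched above works only because `serialize()` and `deserialize()` walk the buffer in lockstep: every new per-type field (here `flops_iq2xxs_f32` and `flops_iq1m_f32`) must be spliced into the same position on both sides, and the advertised byte count (the `sizeof(float) * 12` and `sizeof(float) * 30` bookkeeping) must grow with it. The standalone sketch below illustrates that invariant; `toy_props`, `toy_serialize`, and `toy_deserialize` are simplified stand-ins for illustration, not the actual prima.cpp definitions.

```cpp
#include <cstdio>
#include <cstring>

// Reduced stand-in for cpu_props: three of the per-type flops fields.
struct toy_props {
    float flops_q6k_f32;
    float flops_iq2xxs_f32; // newly added field: packed into the same slot on both sides
    float flops_q50_f32;
};

// Pack fields in a fixed order; the byte count returned must match the
// buffer size the receiver expects (the "sizeof(float) * N" bookkeeping).
static size_t toy_serialize(const toy_props & p, char * buf) {
    char * ptr = buf;
    memcpy(ptr, &p.flops_q6k_f32,    sizeof(float)); ptr += sizeof(float);
    memcpy(ptr, &p.flops_iq2xxs_f32, sizeof(float)); ptr += sizeof(float);
    memcpy(ptr, &p.flops_q50_f32,    sizeof(float)); ptr += sizeof(float);
    return (size_t)(ptr - buf);
}

// Mirror image: unpack in the identical order, one memcpy per field.
static void toy_deserialize(const char * buf, toy_props & p) {
    const char * ptr = buf;
    memcpy(&p.flops_q6k_f32,    ptr, sizeof(float)); ptr += sizeof(float);
    memcpy(&p.flops_iq2xxs_f32, ptr, sizeof(float)); ptr += sizeof(float);
    memcpy(&p.flops_q50_f32,    ptr, sizeof(float)); ptr += sizeof(float);
}

int main() {
    toy_props sent = {11.0f, 2.5f, 7.0f};
    toy_props recv = {};
    char buf[sizeof(float) * 3];

    size_t n = toy_serialize(sent, buf);
    toy_deserialize(buf, recv);
    printf("%zu bytes, iq2xxs flops = %.1f GFLOPS\n", n, recv.flops_iq2xxs_f32);
    return 0;
}
```

If the two ends ever disagree on field order or count, every field after the first mismatch is silently corrupted rather than rejected, which is why the patch updates the size comment, `serialize()`, and `deserialize()` together.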
From 9cb87f7923df3b861f3ab0dd096db202f7cfc1ba Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Thu, 17 Apr 2025 13:44:23 +0000
Subject: [PATCH 08/31] add fio file to gitignore

---
 .gitignore | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 406bebaf..87c0aa4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -135,4 +135,8 @@ poetry.toml
 /lora-tests
 
 # Video
-*.mp4
\ No newline at end of file
+*.mp4
+
+# fio
+fio_test*
+*.fio
\ No newline at end of file

From 168c14f4e88ad31a81ac0c05f3faf88801c7bbda Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Thu, 17 Apr 2025 13:49:09 +0000
Subject: [PATCH 09/31] remove unnecessary profile when `--lw` is specified

---
 common/common.cpp | 43 ++++++++++++++++++++++++++++---------------
 common/profiler.h |  4 ++++
 include/llama.h   |  1 +
 src/llama.cpp     | 40 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 5c972c90..55807f78 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1576,13 +1576,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t my_rank = params.rank;
     bool auto_schedule = params.n_layer_window[0] == 0;
 
-    // get device profile
-    LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-    dev_info.rank = params.rank;
-    if (n_world > 1) {
-        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-    }
-
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -1599,16 +1592,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // initialize sockets
     llama_init_sockets(lctx, n_world, my_rank);
 
+    // broadcast startup args
+    struct startup_args args;
+    if (my_rank==0){
+        args.should_profile = auto_schedule;
+    }
+    llama_bcast_startup_args(lctx, my_rank, &args);
+
+    auto_schedule = args.should_profile;
+    // if n_world > 1 and auto schedule is needed, then profile
+    if (auto_schedule){
+        // get device profile
+        LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
+        dev_info.rank = params.rank;
+        if (n_world > 1) {
+            llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+        }
+    }
+
     // synchronize device profile to the master node
-    struct device_info * dev_info_set = nullptr;
     if (my_rank == 0) {
-        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-        dev_info_set[0] = dev_info;
-
-        llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model, cparams);
-        if (auto_schedule) {
+        struct device_info * dev_info_set = nullptr;
+        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+        dev_info_set[0] = dev_info;
+
+        llama_gather_device_info(lctx, dev_info_set);
+        device_print_props(dev_info_set, n_world, model, cparams);
+        // automatically determine n_layer_window and n_gpu_layers
         if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
             LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@@ -1623,7 +1634,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
         }
     } else {
-        llama_send_device_info(lctx, &dev_info);
+        if (auto_schedule){
+            llama_send_device_info(lctx, &dev_info);
+        }
         llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }

diff --git a/common/profiler.h b/common/profiler.h
index b8fff0d1..a685ff8c 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -312,6 +312,10 @@ struct disk_props {
         write_rnd_bw(0.0f) {}
 };
 
+struct startup_args{
+    bool should_profile;
+};
+
 struct device_info {
     uint32_t rank;
     const char * device_name;

diff --git a/include/llama.h b/include/llama.h
index 7d7392fe..9f3da708 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -453,6 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
     LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);

diff --git a/src/llama.cpp b/src/llama.cpp
index 1aedb6a4..87ae83ac 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20262,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
+LLAMA_API int llama_bcast_startup_args(llama_context *ctx, uint32_t rank, startup_args *args) {
+    int32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    if (rank==0){
+        // send
+        try {
+            std::vector<zmq::message_t> send_msgs;
+            send_msgs.emplace_back("should_profile", strlen("should_profile"));
+            send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+            zmq::send_multipart(*ctx->send_socket, send_msgs);
+        } catch (const zmq::error_t& e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    }else {
+        // receive
+        std::vector<zmq::message_t> recv_msgs;
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+            return -1;
+        }
+        GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
+        GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
+        bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
+        args->should_profile = should_profile;
+        if (rank != n_world-1){
+            // send
+            try {
+                zmq::send_multipart(*ctx->send_socket, recv_msgs);
+            } catch (const zmq::error_t& e) {
+                LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
 int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {

From cf4fa04980ee37a7c609dcdc52410895fe839dd4 Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Fri, 18 Apr 2025 03:27:20 +0000
Subject: [PATCH 10/31] Add an independent profile tool

---
 .gitignore             |  1 +
 Makefile               |  7 ++++++-
 tools/profile_tool.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 tools/profile_tool.cpp

diff --git a/.gitignore b/.gitignore
index 87c0aa4f..6f1e6062 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,6 +67,7 @@ autogen-*.md
 
 /main
 /server
+/profile-tool
 
 # CI

diff --git a/Makefile b/Makefile
index a3f1bf04..bcbaf01c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = llama-cli
+BUILD_TARGETS = llama-cli profile-tool
 # BUILD_TARGETS = \
 # libllava.a \
 # llama-baby-llama \
@@ -1528,6 +1528,11 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+profile-tool: tools/profile_tool.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)

diff --git a/tools/profile_tool.cpp b/tools/profile_tool.cpp
new file mode 100644
index 00000000..328df2e3
--- /dev/null
+++ b/tools/profile_tool.cpp
@@ -0,0 +1,62 @@
+#include "arg.h"
+#include "common.h"
+#include "console.h"
+#include "log.h"
+#include "llama.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n  profile this device:  %s -m your_model.gguf\n", argv[0]);
+    LOG("\n  with custom threads:  %s -m your_model.gguf -t 8\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+        return 1;
+    }
+
+    if (params.n_ctx != 0 && params.n_ctx < 8) {
+        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    if (params.rope_freq_base != 0.0) {
+        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 0.0) {
+        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+    }
+
+    // load the model and apply lora adapter, if any
+    auto mparams = llama_model_params_from_gpt_params(params);
+    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    struct llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else {
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
+    }
+
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return -1;
+    }
+
+    llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
+
+    device_info dev_info;
+    llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    device_print_props(&dev_info, 1, model, cparams);
+
+    llama_free_model(model);
+    return 0;
+}
\ No newline at end of file

From fa1444ab3b6ca7f39abd75263318c40dc91d7ea8 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine
Date: Mon, 21 Apr 2025 23:40:40 +0900
Subject: [PATCH 11/31] chore: update debug-test.sh

compliation -> compilation
---
 scripts/debug-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh
index 91946c51..bc46e085 100755
--- a/scripts/debug-test.sh
+++ b/scripts/debug-test.sh
@@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
 
 pushd "$build_dir"
 tests=($(ctest -R ${test_suite} -V -N | grep -E 
" +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) if [ ${#tests[@]} -eq 0 ]; then - abort "No tests avaliable... check your compliation process..." + abort "No tests avaliable... check your compilation process..." fi popd > /dev/null || exit 1 From e2de4511c5c22b32d0c88ff19680e5888489bb94 Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Sun, 11 May 2025 18:15:39 +0800 Subject: [PATCH 12/31] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a9eb733..71558172 100644 --- a/README.md +++ b/README.md @@ -326,7 +326,7 @@ If you find this work helpful, please do not hesitate to cite us and send a star ```bibtex @misc{li2025primacpp, title={PRIMA.CPP: Speeding Up 70B-Scale LLM Inference on Low-Resource Everyday Home Clusters}, - author={Zonghang Li and Tao Li and Wenjiao Feng and Mohsen Guizani and Hongfang Yu}, + author={Zonghang Li and Tao Li and Wenjiao Feng and Mohsen Guizani and Hongfang Yu and Qirong Ho and Wei Xiang}, year={2025}, eprint={2504.08791}, archivePrefix={arXiv}, From 2fbc0c8da39ec1df35fa011c75da4d891681c8ef Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Wed, 14 May 2025 13:27:20 +0400 Subject: [PATCH 13/31] fix: reset -ngl to 0 when GPU is not used and reformat code --- common/common.cpp | 9 ++++++++- common/profiler.cpp | 1 - include/llama.h | 2 +- src/llama.cpp | 6 +++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 55807f78..fd02664d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1527,6 +1527,12 @@ static bool assign_layers_to_device( // struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + +#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)) + // reset n_gpu_layers to 0 if GPU is not used + params.n_gpu_layers = 0; +#endif + llama_init_result iparams; auto mparams = llama_model_params_from_gpt_params(params); @@ -1582,6 +1588,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { if (n_world == 1) { uint32_t n_layers = llama_model_n_layers(model); + // assign all layers to this device params.n_layer_window[0] = n_layers; cparams.n_layer_window[0] = n_layers; mparams.n_layer_window[0] = n_layers; @@ -1594,7 +1601,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { // broadcast startup args struct startup_args args; - if (my_rank==0){ + if (my_rank == 0){ args.should_profile = auto_schedule; } llama_bcast_startup_args(lctx, my_rank, &args); diff --git a/common/profiler.cpp b/common/profiler.cpp index 18b345a9..461cc3b9 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -350,7 +350,6 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in return 0.0f; } - size_t QK_K = 0; switch (src0t) { case GGML_TYPE_F32: { matrix_B = malloc(embd_size * sizeof(float)); diff --git a/include/llama.h b/include/llama.h index 9f3da708..fd4fec40 100644 --- a/include/llama.h +++ b/include/llama.h @@ -453,7 +453,7 @@ extern "C" { LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg); LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set); LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info); - LLAMA_API int llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args); + LLAMA_API int llama_bcast_startup_args(struct llama_context * ctx, 
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);

diff --git a/src/llama.cpp b/src/llama.cpp
index 87ae83ac..718327e0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20262,13 +20262,13 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
-LLAMA_API int llama_bcast_startup_args(llama_context *ctx, uint32_t rank, startup_args *args) {
+int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
     int32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
     }
     GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
-    if (rank==0){
+    if (rank == 0){
         // send
         try {
             std::vector<zmq::message_t> send_msgs;
@@ -20289,7 +20289,7 @@ LLAMA_API int llama_bcast_startup_args(llama_context *ctx, uint32_t rank, startu
         GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
         bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
         args->should_profile = should_profile;
-        if (rank != n_world-1){
+        if ((int)rank != (int)n_world - 1){
             // send
             try {
                 zmq::send_multipart(*ctx->send_socket, recv_msgs);

From 258fb2d06b95676b3722b143e90fef1d843895cc Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 14 May 2025 14:19:20 +0400
Subject: [PATCH 14/31] add QA: How to manually profile a device

---
 README.md | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 12735e98..e9f8a3bb 100644
--- a/README.md
+++ b/README.md
@@ -287,7 +287,15 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe
 
 > Example: if `-lw "16,16,16,16"` is passed to the head device, then each of the 4 devices will handle 16 model layers. A worker with `-ngl 8` (if a GPU is available) will run 8/16 layers on the GPU.
 
-**2. How to run in chat mode like in llama.cpp?**
+**2. How to manually profile my device?**
+
+If `-lw` is set, prima.cpp skips profiling and runs directly with the user-defined `-lw` and `-ngl`. If you wish to profile a device manually, run `profile-tool` on that device.
+
+```shell
+./profile-tool -m download/qwq-32b-q4_k_m.gguf
+```
+
+**3. How to run in chat mode like in llama.cpp?**
 
 To enable chat (conversation) mode, simply add the `-cnv` flag on the head device:
 
@@ -298,7 +306,7 @@ To enable chat (conversation) mode, simply add the `-cnv` flag on the head devic
 
 To quit the chat mode, input `quit` or `exit`.
 
-**3. How to force prefetching after computing?**
+**4. How to force prefetching after computing?**
 
 By default, prima.cpp only advises the OS to prefetch upcoming layer weights. The actual prefetching is then scheduled and handled by the OS, which may introduce some uncertainty. To explicitly trigger prefetching right after computing, you can use the `--force` flag on each device:
 
@@ -309,11 +317,11 @@ By default, prima.cpp only advises the OS to prefetch upcoming layer weights. Th
 
 This enables more aggressive overlap but also introduces extra memory access latency. Use `--force` only after testing, as its effect depends on your hardware and OS behavior.
 
-**4. Does it support Windows?**
+**5. Does it support Windows?**
 
 Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS, Android and HarmonyOS (via Termux). You can mix heterogeneous devices in the cluster.
 
-**5. Does it support Vulkan or AMD GPUs?**
+**6. Does it support Vulkan or AMD GPUs?**
 
 Not yet. Now prima.cpp supports only CUDA-based GPUs. Vulkan is in our roadmap, and AMD GPUs will be supported once we have that device.

From 2cc01483fd24e67ec67a1c9590c068c109922c0e Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 14 May 2025 18:28:46 +0400
Subject: [PATCH 15/31] support server mode

---
 Makefile  |  6 +++++-
 README.md | 29 +++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index bcbaf01c..1cd1c2d9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,9 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = llama-cli profile-tool
+BUILD_TARGETS = \
+	llama-server \
+	llama-cli \
+	profile-tool
+
 # BUILD_TARGETS = \
 # 	libllava.a \
 # 	llama-baby-llama \

diff --git a/README.md b/README.md
index 5e40cc2e..42ee383f 100644
--- a/README.md
+++ b/README.md
@@ -262,6 +262,35 @@ cd /root/prima.cpp
 
 > If your host machine does not have a GPU, ignore the `--gpu-mem` option.
 
+### Run in Server Mode
+You can run prima.cpp in server mode by launching `llama-server` on the rank 0 device (with `--host` and `--port` specified) and `llama-cli` on the others. Here is an example with 2 devices:
+
+```shell
+# On rank 0, run:
+./llama-server -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 2 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch --host 127.0.0.1 --port 8080
+
+# On rank 1, run:
+./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 2 --rank 1 --master 192.168.1.2 --next 192.168.1.2 --prefetch
+```
+
+After that, you can interact with the rank 0 device by calling the Chat Completion API:
+
+```shell
+curl http://127.0.0.1:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "qwq-32b",
+        "messages": [
+            {"role": "user", "content": "what is edge AI?"}
+        ],
+        "max_tokens": 200,
+        "temperature": 0.7,
+        "stream": true
+    }'
+```
+
+You can also use third-party GUI clients like [AnythingLLM](https://anythingllm.com/) and point the API endpoint at the one prima.cpp exposes (by default, `http://localhost:8080/v1`).
+
 ## ❓ FAQ
 
 **1. How can I manually set the workload for each device?**
From 07c4966a8023fcbfd42bb5c3488b9c504d58d2b3 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 14 May 2025 21:26:01 +0400
Subject: [PATCH 16/31] reduce fio data size to 1gb to speed up profiling

---
 common/profiler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 461cc3b9..b842071c 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -913,7 +913,7 @@ ioengine=%s
 direct=1
 time_based=1
 runtime=1
-size=4G
+size=1G
 group_reporting=1
 iodepth=1

From 26bb86c09bc19e5e17549294ffddb5620b133f27 Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Wed, 14 May 2025 07:50:04 +0000
Subject: [PATCH 17/31] Add tune_layer_allocation

Signed-off-by: DeEMO

---
 common/common.cpp | 64 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 55807f78..021f1640 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1522,6 +1522,61 @@ static bool assign_layers_to_device(
     return true;
 }
 
+static bool tune_layer_allocation(
+        uint32_t n_world,
+        uint32_t my_rank,
+        std::vector<device_info> dev_infos,
+        uint32_t * n_layer_window,
+        uint32_t * n_gpu_layers,
+        struct llama_model * model,
+        const struct llama_context_params cparams,
+        float min_disk_read_speed = 0.1f) {
+    memset(n_layer_window, 0, n_world * sizeof(uint32_t));
+    memset(n_gpu_layers, 0, n_world * sizeof(uint32_t));
+    std::vector<device_info> dev_infos_temp = dev_infos;
+    std::vector<uint32_t> n_layer_windows_temp;
+    std::vector<uint32_t> n_gpu_layers_temp;
+    while(n_world > 0) {
+        std::vector<device_info> dev_infos_ = dev_infos_temp;
+        std::vector<uint32_t> n_layer_windows_(n_world, 0);
+        std::vector<uint32_t> n_gpu_layers_(n_world, 0);
+        if (!assign_layers_to_device(n_world, my_rank, dev_infos_.data(),
+                                     n_layer_windows_.data(), n_gpu_layers_.data(), model, cparams)) {
+            return false;
+        }
+        dev_infos_temp.clear();
+        n_layer_windows_temp.clear();
+        n_gpu_layers_temp.clear();
+        for(auto i=0; i<n_world; i++) {
+            if(n_layer_windows_[i] > 1 || i==0 ) {
+                dev_infos_temp.push_back(dev_infos_[i]);
+                n_layer_windows_temp.push_back(n_layer_windows_[i]);
+                n_gpu_layers_temp.push_back(n_gpu_layers_[i]);
+            }
+        }
+        if(dev_infos_temp.size() == n_world) {
+            // no device be removed
+            break;
+        }
+
+        n_world = dev_infos_temp.size();
+    }
+    int i =0 , j =0;
+    while(j < n_world) {
+        if(dev_infos[i].rank == dev_infos_temp[j].rank){
+            n_layer_window[i] = n_layer_windows_temp[j];
+            n_gpu_layers[i] = n_gpu_layers_temp[j];
+            j++;
+            i++;
+        } else {
+            n_layer_window[i] = 0;
+            n_gpu_layers[i] = 0;
+            i++;
+        }
+    }
+    return true;
+}
+
 //
 // Model utils
 //
@@ -1613,15 +1668,14 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // sychronize device profile to the master node
     if (my_rank == 0) {
         if (auto_schedule) {
-            struct device_info * dev_info_set = nullptr;
-            dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+            std::vector<device_info> dev_info_set(n_world);
             dev_info_set[0] = dev_info;
-            llama_gather_device_info(lctx, dev_info_set);
-            device_print_props(dev_info_set, n_world, model, cparams);
+            llama_gather_device_info(lctx, dev_info_set.data());
+            device_print_props(dev_info_set.data(), n_world, model, cparams);
 
             // automatically determine n_layer_window and n_gpu_layers
-            if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
+            if (!tune_layer_allocation(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                 LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
                 llama_free(lctx);
                 llama_free_model(model);
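The `tune_layer_allocation()` added above is easiest to see in isolation: it repeatedly calls the HiGHS-backed solver, drops every device that received at most one layer (always keeping the head device at index 0), and re-solves on the shrunken cluster until the device set stabilizes. Below is a minimal, self-contained sketch of that retry loop; `solve()` here is a toy stand-in that just spreads 32 layers evenly, whereas the patch calls `assign_layers_to_device()` and also tracks per-device GPU layers.

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct device { uint32_t rank; };

// Toy stand-in for the HiGHS-backed assign_layers_to_device().
static bool solve(const std::vector<device> & devs, std::vector<uint32_t> & windows) {
    const size_t n = devs.size();
    for (size_t i = 0; i < n; i++) {
        windows[i] = (uint32_t) (32 / n + (i < 32 % n ? 1 : 0));
    }
    return true;
}

static bool tune(std::vector<device> devs, std::vector<uint32_t> & windows) {
    while (!devs.empty()) {
        windows.assign(devs.size(), 0);
        if (!solve(devs, windows)) {
            return false;
        }
        std::vector<device>   kept;
        std::vector<uint32_t> kept_windows;
        for (size_t i = 0; i < devs.size(); i++) {
            // keep the head device and any device that got more than one layer
            if (windows[i] > 1 || i == 0) {
                kept.push_back(devs[i]);
                kept_windows.push_back(windows[i]);
            }
        }
        if (kept.size() == devs.size()) {
            windows = std::move(kept_windows);  // fixed point: no device removed
            return true;
        }
        devs = std::move(kept);                 // re-solve on the smaller cluster
    }
    return false;
}

int main() {
    std::vector<device> devs = {{0}, {1}, {2}};
    std::vector<uint32_t> windows;
    if (tune(devs, windows)) {
        for (size_t i = 0; i < windows.size(); i++) {
            std::printf("device %zu: %u layers\n", i, windows[i]);
        }
    }
    return 0;
}
```

The mapping of the surviving windows back to the original rank positions (zeroing out removed devices) is handled by the trailing `while` loop in the patch.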
From fdd669463398483b81def2c7fbd723e65ed5a087 Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Thu, 15 May 2025 04:22:12 +0000
Subject: [PATCH 18/31] add topo rebuild

Signed-off-by: DeEMO

---
 common/common.cpp | 10 +++++-
 common/profiler.h |  2 ++
 include/llama.h   |  1 +
 src/llama.cpp     | 86 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 021f1640..991b34a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1660,6 +1660,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // get device profile
     LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
     dev_info.rank = params.rank;
+    dev_info.next_ip = params.next_node_ip.c_str();
     if (n_world > 1) {
         llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
     }
@@ -1682,6 +1683,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
                 return iparams;
             }
             llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
+
+            //rebuild topo
+            llama_rebuild_topo(lctx, n_layer_window, dev_info_set.data());
         } else {
             // use the user-defined n_layer_window
             std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
@@ -1690,8 +1694,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     } else {
         if (auto_schedule){
             llama_send_device_info(lctx, &dev_info);
+            llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
+            // rebuild topo
+            llama_rebuild_topo(lctx,n_layer_window, nullptr);
+        }else{
+            llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
         }
-        llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }
 
     // update n_layer_window and n_gpu_layers

diff --git a/common/profiler.h b/common/profiler.h
index a685ff8c..a3110299 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -320,6 +320,7 @@ struct device_info {
     uint32_t rank;
     const char * device_name;
     const char * device_os;
+    const char * next_ip;
     struct disk_props disk;
     struct cpu_props cpu_props;
     struct memory_info memory;
@@ -333,6 +334,7 @@ struct device_info {
         rank(0),
         device_name(""),
         device_os(""),
+        next_ip(""),
         disk(),
         cpu_props(),
         memory(),

diff --git a/include/llama.h b/include/llama.h
index 9f3da708..515dfd93 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -455,6 +455,7 @@ extern "C" {
     LLAMA_API int  llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
     LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
+    LLAMA_API int  llama_rebuild_topo     (struct llama_context * ctx, uint32_t * n_layer_window, struct device_info * dev_info_set);
     LLAMA_API int  llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);

diff --git a/src/llama.cpp b/src/llama.cpp
index 87ae83ac..8cc37213 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20329,6 +20329,92 @@ int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_windo
     return 0;
 }
 
+LLAMA_API int llama_rebuild_topo(llama_context *ctx,
+                                 uint32_t *n_layer_window,
+                                 device_info *dev_info_set) {
+    uint32_t n_world = ctx->cparams.n_world;
+    uint32_t my_rank = ctx->cparams.rank;
+    std::vector<zmq::message_t> msgs;
+    device_info* dev_info_ptr = nullptr;
+    if (dev_info_set == nullptr){
+        // for rank!=0, recv all devices info
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(msgs))) {
+            return -1;
+        }
+        dev_info_ptr = new device_info[n_world];
+        for (size_t i = 0; i < msgs.size(); i++) {
+            deserialize((const char *)msgs[i].data(), &dev_info_set[i]);
+        }
+    }else{
+        char * buffer = nullptr;
+        for(size_t i = 0; i < n_world; i++) {
+            size_t buffer_size = serialize(&dev_info_set[i], &buffer);
+            msgs.emplace_back(buffer, buffer_size);
+
+            free(buffer);
+        }
+        dev_info_ptr = dev_info_set;
+    }
+
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    GGML_ASSERT(msgs.size() == n_world);
+
+    // notify next rank
+    auto next_rank = (my_rank + 1) % n_world;
+    if(n_layer_window[next_rank] <= 0){
+        try {
+            ctx->send_socket->setsockopt(ZMQ_LINGER, 3500);
+            zmq::send_multipart(*ctx->send_socket, msgs);
+        } catch (const zmq::error_t& e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            if(!dev_info_set){
+                delete[] dev_info_ptr;
+            }
+            return -1;
+        }
+    }
+
+    // check myself's layer
+    auto* socket_to_close = ctx->send_socket;
+    if(n_layer_window[my_rank] > 0) {
+        // reconstruct socket to the next valid rank
+        std::string next_ip;
+        auto current_rank = my_rank;
+        while(next_rank!=my_rank){
+            if(n_layer_window[next_rank] > 0){
+                next_ip = dev_info_ptr[next_rank].next_ip;
+                break;
+            }
+            next_rank = (next_rank + 1) % n_world;
+            current_rank = (current_rank + 1) % n_world;
+        }
+        if(!next_ip.empty()){
+            ctx->send_socket = new zmq::socket_t(*ctx->sock_context, zmq::socket_type::push);
+            std::string send_endp = "tcp://" + next_ip + ":" + std::to_string(map_rank_to_port(next_rank, ctx->data_port));
+            ctx->next_node_ip = next_ip;
+            try {
+                ctx->send_socket->connect(send_endp);
+                zmq::send_multipart(*ctx->send_socket, msgs);
+            } catch (const zmq::error_t &e) {
+                LLAMA_LOG_INFO("Error binding/connecting recv socket to endpoint: %s", e.what());
+                if(!dev_info_set){
+                    delete[] dev_info_ptr;
+                }
+                return -1;
+            }
+        }
+    }
+    if(!dev_info_set){
+        delete[] dev_info_ptr;
+    }
+    socket_to_close->close();
+    delete socket_to_close;
+    if(n_layer_window[my_rank]<=0){
+        exit(0);
+    }
+    return true;
+}
+
 int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     uint32_t my_rank = ctx->cparams.rank;

From cc46aa9828f3ff3223d895cd92d124cdccc06dc3 Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Thu, 15 May 2025 13:57:16 +0800
Subject: [PATCH 19/31] update rank and n_world

Signed-off-by: DeEMO

---
 common/common.cpp | 24 ++++++++++++++++++++
 include/llama.h   |  6 +++++-
 src/llama.cpp     |  9 +++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 991b34a5..88b00075 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1702,6 +1702,30 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
+    //update rank and n_world for consistency
+    uint32_t update_rank = 0;
+    uint32_t update_n_world = 1;
+    std::vector<uint32_t> n_layer_window_temp = {n_layer_window[0]};
+    std::vector<uint32_t> n_gpu_layers_temp = {n_gpu_layers[0]};
+    for(auto i=1; i<n_world; i++) {
+        if(n_layer_window[i] != 0) {
+            n_layer_window_temp.push_back(n_layer_window[i]);
+            n_gpu_layers_temp.push_back(n_gpu_layers[i]);
+            if(i == my_rank) {
+                update_rank = update_n_world;
+            }
+            update_n_world++;
+        }
+    }
+    memset(n_layer_window, 0, n_world * sizeof(uint32_t));
+    memset(n_gpu_layers, 0, n_world * sizeof(uint32_t));
+    for(auto i=0; i<update_n_world; i++) {
+        n_layer_window[i] = n_layer_window_temp[i];
+        n_gpu_layers[i] = n_gpu_layers_temp[i];
+    }
+    llama_update_context_with_rankworld(lctx, update_rank, update_n_world);
+
     // update n_layer_window and n_gpu_layers
     std::copy(std::begin(n_layer_window), std::end(n_layer_window), params.n_layer_window);

diff --git a/include/llama.h b/include/llama.h
--- a/include/llama.h
+++ b/include/llama.h
@@ ... @@ extern "C" {
     LLAMA_API int  llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
+    LLAMA_API void llama_update_context_with_rankworld(struct llama_context * ctx, uint32_t rank, uint32_t n_world);

diff --git a/src/llama.cpp b/src/llama.cpp
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ ... @@
+
+void llama_update_context_with_rankworld(struct llama_context * ctx, uint32_t rank,
+                                        uint32_t n_world) {
+    if(ctx) {
+        ctx->cparams.rank = rank;
+        ctx->cparams.n_world = n_world;
+    }
+}
+
 struct llama_context * llama_new_context_with_model(
         struct llama_context_params params) {
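To make the renumbering above concrete, here is a small worked example using the same bookkeeping as the hunk (the values are invented for the demo): with per-rank windows `{16, 0, 24, 8}`, the device at rank 1 leaves the ring, so rank 2 becomes rank 1 in a world of 3.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // layer window per original rank; 0 means the device was dropped
    std::vector<uint32_t> n_layer_window = {16, 0, 24, 8};
    uint32_t my_rank = 2;   // this device's original rank

    uint32_t update_rank = 0, update_n_world = 0;
    for (uint32_t i = 0; i < n_layer_window.size(); i++) {
        if (n_layer_window[i] == 0) continue;          // skip dropped devices
        if (i == my_rank) update_rank = update_n_world; // survivors before me
        update_n_world++;
    }
    // prints: rank 2 -> rank 1, n_world 4 -> 3
    std::printf("rank %u -> rank %u, n_world %zu -> %u\n",
                my_rank, update_rank, n_layer_window.size(), update_n_world);
    return 0;
}
```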
From 4b36aef157e5234b3f971c9523214595859c3dec Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Thu, 15 May 2025 06:25:12 +0000
Subject: [PATCH 20/31] fix some bugs

Signed-off-by: DeEMO

---
 common/common.cpp | 14 ++++++++++----
 src/llama.cpp     | 20 ++++++++++----------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 88b00075..a98337d3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1547,7 +1547,7 @@ static bool tune_layer_allocation(
         dev_infos_temp.clear();
         n_layer_windows_temp.clear();
         n_gpu_layers_temp.clear();
-        for(auto i=0; i<n_world; i++) {
-            if(n_layer_windows_[i] > 1 || i==0 ) {
+        for(uint32_t i=0; i<n_world; i++) {
+            if(n_layer_windows_[i] > 1 || i==0 ) {
                 dev_infos_temp.push_back(dev_infos_[i]);
                 n_layer_windows_temp.push_back(n_layer_windows_[i]);
@@ -1561,7 +1561,7 @@ static bool tune_layer_allocation(
         n_world = dev_infos_temp.size();
     }
-    int i =0 , j =0;
+    uint32_t i =0 , j =0;
     while(j < n_world) {
         if(dev_infos[i].rank == dev_infos_temp[j].rank){
             n_layer_window[i] = n_layer_windows_temp[j];
@@ -1701,13 +1701,19 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
         }
     }
+    if(n_layer_window[my_rank]<=0){
+        LOG_INF("%s: info: rank %d has no layers to run, skipping\n", __func__, my_rank);
+        llama_free(lctx);
+        llama_free_model(model);
+        exit(0);
+    }
 
     //update rank and n_world for consistency
     uint32_t update_rank = 0;
     uint32_t update_n_world = 1;
     std::vector<uint32_t> n_layer_window_temp = {n_layer_window[0]};
     std::vector<uint32_t> n_gpu_layers_temp = {n_gpu_layers[0]};
-    for(auto i=1; i<n_world; i++) {
+    for(uint32_t i=1; i<n_world; i++) {
         if(n_layer_window[i] != 0) {
             n_layer_window_temp.push_back(n_layer_window[i]);
             n_gpu_layers_temp.push_back(n_gpu_layers[i]);

diff --git a/src/llama.cpp b/src/llama.cpp
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ ... @@
-    uint32_t n_world = ctx->cparams.n_world;
+    auto n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
     }
@@ -20343,14 +20343,14 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
         }
         dev_info_ptr = new device_info[n_world];
         for (size_t i = 0; i < msgs.size(); i++) {
-            deserialize((const char *)msgs[i].data(), &dev_info_set[i]);
+            deserialize((const char *)msgs[i].data(), &dev_info_ptr[i]);
         }
     }else{
         char * buffer = nullptr;
         for(size_t i = 0; i < n_world; i++) {
             size_t buffer_size = serialize(&dev_info_set[i], &buffer);
             msgs.emplace_back(buffer, buffer_size);
-
+
             free(buffer);
         }
         dev_info_ptr = dev_info_set;
@@ -20361,9 +20361,9 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
 
     // notify next rank
     auto next_rank = (my_rank + 1) % n_world;
-    if(n_layer_window[next_rank] <= 0){
+    if(n_layer_window[next_rank] <= 0 && next_rank != 0){
         try {
-            ctx->send_socket->setsockopt(ZMQ_LINGER, 3500);
+            ctx->send_socket->set(zmq::sockopt::linger, 3500);
             zmq::send_multipart(*ctx->send_socket, msgs);
         } catch (const zmq::error_t& e) {
             LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
@@ -20382,7 +20382,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
         auto current_rank = my_rank;
         while(next_rank!=my_rank){
             if(n_layer_window[next_rank] > 0){
-                next_ip = dev_info_ptr[next_rank].next_ip;
+                next_ip = dev_info_ptr[current_rank].next_ip;
                 break;
             }
             next_rank = (next_rank + 1) % n_world;
@@ -20402,6 +20402,9 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
             }
             return -1;
         }
+    }else{
+        // only one node
+        ctx->next_node_ip = "";
     }
     }
@@ -20409,10 +20412,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
     }
     socket_to_close->close();
     delete socket_to_close;
-    if(n_layer_window[my_rank]<=0){
-        exit(0);
-    }
-    return true;
+    return 0;
 }
 
 int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
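One of the fixes above replaces the deprecated `setsockopt(ZMQ_LINGER, ...)` call with the typed cppzmq API. The linger option matters in this code path because a node that rewires the ring may close a socket that still holds unsent messages; with the default (infinite) linger, those pending messages can keep `zmq_ctx_term` waiting indefinitely. A minimal sketch of the idiom, assuming cppzmq >= 4.7 for `zmq::sockopt` (the function name is illustrative):

```cpp
#include <zmq.hpp>

// Close a PUSH socket without letting queued-but-unsent messages stall
// context termination forever: cap the flush window at 3.5 seconds.
static void close_push_socket(zmq::socket_t & sock) {
    sock.set(zmq::sockopt::linger, 3500);  // milliseconds
    sock.close();
}
```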
From 0ad009a2f45363579219c3c21b24c191ea3a4e0f Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Fri, 16 May 2025 15:26:16 +0800
Subject: [PATCH 21/31] fix: update serialization and deserialization for
 next_ip in device_info

Signed-off-by: DeEMO

---
 common/profiler.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 18b345a9..380bc5b0 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -2358,15 +2358,17 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     // calculate total size for serialized buffer
     size_t device_name_len     = strlen(dev_info->device_name) + 1;
     size_t device_os_len       = strlen(dev_info->device_os) + 1;
+    size_t next_ip_len         = strlen(dev_info->next_ip) + 1;
     size_t cpu_name_len        = strlen(dev_info->cpu_props.name) + 1;
     size_t cpu_description_len = strlen(dev_info->cpu_props.description) + 1;
     size_t gpu_name_len        = strlen(dev_info->gpu_props.name) + 1;
     size_t gpu_description_len = strlen(dev_info->gpu_props.description) + 1;
 
     size_t total_size = sizeof(uint32_t)
-                      + sizeof(size_t) * 6  // for lengths of strings
+                      + sizeof(size_t) * 7  // for lengths of strings
                       + device_name_len
                       + device_os_len
+                      + next_ip_len
                       + cpu_name_len
                       + cpu_description_len
                       + gpu_name_len
@@ -2426,6 +2428,11 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, dev_info->device_os, device_os_len);
     ptr += device_os_len;
 
+    memcpy(ptr, &next_ip_len, sizeof(size_t));
+    ptr += sizeof(size_t);
+    memcpy(ptr, dev_info->next_ip, next_ip_len);
+    ptr += next_ip_len;
+
     memcpy(ptr, &cpu_name_len, sizeof(size_t));
     ptr += sizeof(size_t);
     memcpy(ptr, dev_info->cpu_props.name, cpu_name_len);
@@ -2611,6 +2618,14 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(const_cast<void *>(static_cast<const void *>(dev_info->device_os)), ptr, device_os_len);
     ptr += device_os_len;
 
+    // next ip
+    size_t next_ip_len;
+    memcpy(&next_ip_len, ptr, sizeof(size_t));
+    ptr += sizeof(size_t);
+    dev_info->next_ip = (char *)malloc(next_ip_len);
+    memcpy(const_cast<void *>(static_cast<const void *>(dev_info->next_ip)), ptr, next_ip_len);
+    ptr += next_ip_len;
+
     // cpu_props.name
     size_t cpu_name_len;
     memcpy(&cpu_name_len, ptr, sizeof(size_t));

From df16b1876f316958dbef6e3843f63e1888cb4725 Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Fri, 16 May 2025 16:02:25 +0800
Subject: [PATCH 22/31] refactor: add zmq helper to generate message

Signed-off-by: DeEMO

---
 src/llama.cpp | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5c640b90..a0b91edd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -161,6 +161,19 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
+// zmq helpers
+static std::vector<zmq::message_t> dev_infos_to_messages(const device_info* infos,
+                                                         uint32_t n_world){
+    std::vector<zmq::message_t> res;
+    for (uint32_t i = 0; i < n_world; ++i) {
+        char * buffer = nullptr;
+        size_t buffer_size = serialize(&infos[i], &buffer);
+        res.emplace_back(buffer, buffer_size);
+        free(buffer);
+    }
+    return res;
+}
+
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...)
{
    va_list ap;
@@ -20334,10 +20347,10 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
                                  device_info *dev_info_set) {
     uint32_t n_world = ctx->cparams.n_world;
     uint32_t my_rank = ctx->cparams.rank;
-    std::vector<zmq::message_t> msgs;
     device_info* dev_info_ptr = nullptr;
     if (dev_info_set == nullptr){
         // for rank!=0, recv all devices info
+        std::vector<zmq::message_t> msgs;
         if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(msgs))) {
             return -1;
         }
@@ -20345,24 +20358,18 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
         for (size_t i = 0; i < msgs.size(); i++) {
             deserialize((const char *)msgs[i].data(), &dev_info_ptr[i]);
         }
+        GGML_ASSERT(msgs.size() == n_world);
     }else{
-        char * buffer = nullptr;
-        for(size_t i = 0; i < n_world; i++) {
-            size_t buffer_size = serialize(&dev_info_set[i], &buffer);
-            msgs.emplace_back(buffer, buffer_size);
-
-            free(buffer);
-        }
         dev_info_ptr = dev_info_set;
     }
 
     GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
 
     // notify next rank
     auto next_rank = (my_rank + 1) % n_world;
     if(n_layer_window[next_rank] <= 0 && next_rank != 0){
         try {
+            auto msgs = dev_infos_to_messages(dev_info_ptr, n_world);
             ctx->send_socket->set(zmq::sockopt::linger, 3500);
             zmq::send_multipart(*ctx->send_socket, msgs);
         } catch (const zmq::error_t& e) {
@@ -20394,6 +20401,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
             ctx->next_node_ip = next_ip;
             try {
                 ctx->send_socket->connect(send_endp);
+                auto msgs = dev_infos_to_messages(dev_info_ptr, n_world);
                 zmq::send_multipart(*ctx->send_socket, msgs);
             } catch (const zmq::error_t &e) {
                 LLAMA_LOG_INFO("Error binding/connecting recv socket to endpoint: %s", e.what());
@@ -20477,7 +20485,7 @@ void llama_free_sockets(struct llama_context * ctx, char ** msg) {
 
 void llama_update_context_with_rankworld(struct llama_context * ctx, uint32_t rank,
-                                        uint32_t n_world) {
+                                         uint32_t n_world) {
     if(ctx) {
         ctx->cparams.rank = rank;
         ctx->cparams.n_world = n_world;

From 8b61cb2fa4b30952bc303a587dc1d77bfc8af5dd Mon Sep 17 00:00:00 2001
From: DeEMO
Date: Fri, 16 May 2025 17:03:36 +0800
Subject: [PATCH 23/31] fix: adapt the new topo

Signed-off-by: DeEMO

---
 common/common.cpp      | 8 ++++++++
 examples/main/main.cpp | 7 +++++--
 src/llama.cpp          | 6 +++++-
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index a98337d3..35d285c6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1731,6 +1731,14 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         n_gpu_layers[i] = n_gpu_layers_temp[i];
     }
     llama_update_context_with_rankworld(lctx, update_rank, update_n_world);
+    cparams.rank    = update_rank;
+    cparams.n_world = update_n_world;
+    mparams.rank    = update_rank;
+    mparams.n_world = update_n_world;
+    params.rank     = update_rank;
+    params.n_world  = update_n_world;
+    my_rank = update_rank;
+    n_world = update_n_world;
 
     // update n_layer_window and n_gpu_layers
     std::copy(std::begin(n_layer_window), std::end(n_layer_window), params.n_layer_window);

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 39d4b60c..04680373 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -143,8 +143,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    const uint32_t n_world  = params.n_world;
-    const uint32_t my_rank  = params.rank;
+    uint32_t n_world  = params.n_world;
+    uint32_t my_rank  = params.rank;
     GGML_ASSERT(!(n_world == 1 && my_rank > 0));
 
     // check if --n-layer-window and --world is matched
@@ -200,6 +200,9 @@ int main(int argc, char ** argv) {
    // load the model
and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); + // update + my_rank = params.rank; + n_world = params.n_world; model = llama_init.model; ctx = llama_init.context; diff --git a/src/llama.cpp b/src/llama.cpp index a0b91edd..f083e8e5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2585,6 +2585,7 @@ static_assert(std::is_trivially_copyable::value, "llama_hparams m struct llama_cparams { uint32_t n_world; uint32_t rank; + uint32_t original_next_rank; // original rank of the next node uint32_t n_layer_window[32]; bool prefetch; bool force; @@ -20399,6 +20400,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, ctx->send_socket = new zmq::socket_t(*ctx->sock_context, zmq::socket_type::push); std::string send_endp = "tcp://" + next_ip + ":" + std::to_string(map_rank_to_port(next_rank, ctx->data_port)); ctx->next_node_ip = next_ip; + ctx->cparams.original_next_rank = next_rank; try { ctx->send_socket->connect(send_endp); auto msgs = dev_infos_to_messages(dev_info_ptr, n_world); @@ -20457,7 +20459,8 @@ int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window void llama_free_sockets(struct llama_context * ctx, char ** msg) { const uint32_t n_world = ctx->cparams.n_world; const uint32_t my_rank = ctx->cparams.rank; - const uint32_t next_rank = (my_rank + 1) % n_world; + // to adapt to the new topology, use old next_rank + const uint32_t next_rank = ctx->cparams.original_next_rank; if (n_world == 1) { return; @@ -20508,6 +20511,7 @@ struct llama_context * llama_new_context_with_model( ctx->cparams.n_world = params.n_world; ctx->cparams.rank = params.rank; ctx->cparams.force = params.force; + ctx->cparams.original_next_rank = (params.rank + 1) % params.n_world; return ctx; } From 34eaa8224d18952c92a81ebaf341ba26c30347fa Mon Sep 17 00:00:00 2001 From: DeEMO Date: Fri, 16 May 2025 20:48:51 +0800 Subject: [PATCH 24/31] fix: handle socket closure and connection in llama_rebuild_topo Signed-off-by: DeEMO --- src/llama.cpp | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f083e8e5..121a00b6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20383,7 +20383,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, } // check myself's layer - auto* socket_to_close = ctx->send_socket; + zmq::socket_t* socket_to_close = nullptr; if(n_layer_window[my_rank] > 0) { // reconstruct socket to the next valid rank std::string next_ip; @@ -20397,20 +20397,25 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, current_rank = (current_rank + 1) % n_world; } if(!next_ip.empty()){ - ctx->send_socket = new zmq::socket_t(*ctx->sock_context, zmq::socket_type::push); - std::string send_endp = "tcp://" + next_ip + ":" + std::to_string(map_rank_to_port(next_rank, ctx->data_port)); - ctx->next_node_ip = next_ip; - ctx->cparams.original_next_rank = next_rank; - try { + if((my_rank+1)%n_world != next_rank){ + socket_to_close = ctx->send_socket; + ctx->send_socket = new zmq::socket_t(*ctx->sock_context, zmq::socket_type::push); + std::string send_endp = "tcp://" + next_ip + ":" + std::to_string(map_rank_to_port(next_rank, ctx->data_port)); ctx->send_socket->connect(send_endp); - auto msgs = dev_infos_to_messages(dev_info_ptr, n_world); - zmq::send_multipart(*ctx->send_socket, msgs); - } catch (const zmq::error_t &e) { - LLAMA_LOG_INFO("Error binding/connecting recv socket to endpoint: 
%s", e.what()); - if(!dev_info_set){ - delete[] dev_info_ptr; + ctx->next_node_ip = next_ip; + ctx->cparams.original_next_rank = next_rank; + } + if(next_rank != 0){ + try { + auto msgs = dev_infos_to_messages(dev_info_ptr, n_world); + zmq::send_multipart(*ctx->send_socket, msgs); + } catch (const zmq::error_t &e) { + LLAMA_LOG_INFO("Error binding/connecting recv socket to endpoint: %s", e.what()); + if(!dev_info_set){ + delete[] dev_info_ptr; + } + return -1; } - return -1; } }else{ // only one node @@ -20420,8 +20425,10 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, if(!dev_info_set){ delete[] dev_info_ptr; } - socket_to_close->close(); - delete socket_to_close; + if(socket_to_close != nullptr){ + socket_to_close->close(); + delete socket_to_close; + } return 0; } From c54a6a0132cc7a7a303ba357d6c7788e4bdc508b Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Mon, 19 May 2025 16:58:35 +0400 Subject: [PATCH 25/31] fix context shifting --- Makefile | 2 +- README.md | 24 +-- common/arg.cpp | 39 +++-- common/common.cpp | 22 ++- common/profiler.h | 3 +- examples/server/server.cpp | 49 ++++--- include/llama.h | 39 ++++- src/llama.cpp | 292 ++++++++++++++++++++++++++++++++++--- 8 files changed, 397 insertions(+), 73 deletions(-) diff --git a/Makefile b/Makefile index 1cd1c2d9..60cfc22f 100644 --- a/Makefile +++ b/Makefile @@ -272,7 +272,7 @@ MK_LDFLAGS += -L/usr/local/lib -lzmq ifeq ($(UNAME_S),Darwin) MK_CPPFLAGS += -isystem /opt/homebrew/include - MK_LDFLAGS += -L/opt/homebrew/lib -lzmq + MK_LDFLAGS += -L/opt/homebrew/lib endif ifeq ($(USE_HIGHS),1) diff --git a/README.md b/README.md index 42ee383f..17661e2e 100644 --- a/README.md +++ b/README.md @@ -203,17 +203,17 @@ graph LR; Take QwQ-32B as an example, run the following commands on the devices to launch distributed inference: ```shell -# on head device without a GPU, rank 0: +# On head device without a GPU, rank 0: ./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 -n 256 -p "what is edge AI?" --world 4 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch -# on worker device with 8 GiB VRAM, rank 1: -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch --gpu-mem 8 +# On worker device with 8 GiB VRAM, rank 1: +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch --gpu-mem 8 -# on worker device with 11 GiB VRAM, rank 2: -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch --gpu-mem 11 +# On worker device with 11 GiB VRAM, rank 2: +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch --gpu-mem 11 -# on worker device without a GPU, rank 3: -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch +# On worker device without a GPU, rank 3: +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch ``` Once started, prima.cpp will profile each device and decide how much workload to assign, e.g., how many model layers each device should handle, and how many of them should run on GPU (if available). @@ -262,6 +262,8 @@ cd /root/prima.cpp > If your host machine does not have a GPU, ignore the `--gpu-mem` option. +> If you update to the latest code, non-rank 0 nodes can omit `-c 1024`. 
+ ### Run in Server Mode You can run prima.cpp in server mode, by launching `llama-server` on the rank 0 device (with `--host` and `--port` specified) and `llama-cli` on the others. Here is an example with 2 devices: @@ -270,7 +272,7 @@ You can run prima.cpp in server mode, by launching `llama-server` on the rank 0 ./llama-server -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 2 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch --host 127.0.0.1 --port 8080 # On rank 1, run: -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 2 --rank 1 --master 192.168.1.2 --next 192.168.1.2 --prefetch +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 2 --rank 1 --master 192.168.1.2 --next 192.168.1.2 --prefetch ``` After that, you can interact with the rank 0 device by calling the Chat Completion API: @@ -302,13 +304,13 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe ./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 -n 256 -p "what is edge AI?" --world 4 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch -lw "16,16,16,16" # on worker device with 8 GiB VRAM, rank 1, use the option "-ngl": -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch -ngl 16 +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch -ngl 16 # on worker device with 11 GiB VRAM, rank 2, use the option "-ngl": -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch -ngl 16 +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch -ngl 16 # on worker device without a GPU, rank 3: -./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch +./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch ``` - `-lw` sets the total model layers each device should handle. The format is a comma-separated list, one value per device, in rank order. You can also set `"8,8,8,8"`, `"4,4,4,4"`, `"16,16,24,8"`. 
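For readers skimming the README hunk above: `-lw` is a plain comma-separated list with one entry per rank. The actual flag parsing lives in common/arg.cpp and is not shown in this patch; the sketch below is only an illustrative stand-in (the helper name is hypothetical) for how such a value maps onto the per-rank layer-window array.

```cpp
#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: turn "16,16,24,8" into one window size per rank.
static std::vector<uint32_t> parse_layer_window(const std::string & arg) {
    std::vector<uint32_t> windows;
    std::stringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, ',')) {
        windows.push_back((uint32_t) std::stoul(item));
    }
    return windows;
}

int main() {
    const std::vector<uint32_t> windows = parse_layer_window("16,16,24,8");
    for (size_t rank = 0; rank < windows.size(); rank++) {
        std::printf("rank %zu handles %u layers\n", rank, windows[rank]);
    }
    return 0;
}
```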
diff --git a/common/arg.cpp b/common/arg.cpp index d53a09f8..3dcaa051 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -986,13 +986,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"--no-warmup"}, - "skip warming up the model with an empty run", - [](gpt_params & params) { - params.warmup = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + // add_opt(llama_arg( + // {"--no-warmup"}, + // "skip warming up the model with an empty run", + // [](gpt_params & params) { + // params.warmup = false; + // } + // ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--spm-infill"}, format( @@ -1317,6 +1317,12 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"-ctk", "--cache-type-k"}, "TYPE", format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), [](gpt_params & params, const std::string & value) { + +#ifdef GGML_USE_METAL + LOG_WRN("The option -ctk or --cache-type-k is not supported on Metal, use default type\n"); + return; +#endif + // TODO: get the type right here params.cache_type_k = value; } @@ -1325,6 +1331,11 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"-ctv", "--cache-type-v"}, "TYPE", format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), [](gpt_params & params, const std::string & value) { +#ifdef GGML_USE_METAL + LOG_WRN("The option -ctv or --cache-type-v is not supported on Metal, use default type\n"); + return; +#endif + // TODO: get the type right here params.cache_type_v = value; } @@ -1413,13 +1424,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(llama_arg( - {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { - params.n_parallel = value; - } - ).set_env("LLAMA_ARG_N_PARALLEL")); + // add_opt(llama_arg( + // {"-np", "--parallel"}, "N", + // format("number of parallel sequences to decode (default: %d)", params.n_parallel), + // [](gpt_params & params, int value) { + // params.n_parallel = value; + // } + // ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", params.n_sequences), diff --git a/common/common.cpp b/common/common.cpp index fd02664d..89707fb7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1593,6 +1593,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { cparams.n_layer_window[0] = n_layers; mparams.n_layer_window[0] = n_layers; llama_context_n_layer_window(lctx)[0] = n_layers; + +#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) + params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers); +#endif + } else { uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0}; @@ -1603,10 +1608,18 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { struct startup_args args; if (my_rank == 0){ args.should_profile = auto_schedule; + args.n_ctx = params.n_ctx; } + llama_bcast_startup_args(lctx, my_rank, &args); - auto_schedule = args.should_profile; + if (my_rank > 0) { + // receive startup args + auto_schedule = args.should_profile; + params.n_ctx = args.n_ctx; + cparams.n_ctx = args.n_ctx; + } + // if 
n_world > 1 and need auto schedule, then profile
     if (auto_schedule){
         // get device profile
@@ -1658,6 +1671,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             cparams.n_gpu_layers = n_gpu_layers[my_rank];
             mparams.n_gpu_layers = n_gpu_layers[my_rank];
             llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
+        } else { // -ngl is set
+            params.n_gpu_layers  = std::min(params.n_gpu_layers, (int32_t)n_layer_window[my_rank]);
+            cparams.n_gpu_layers = params.n_gpu_layers;
+            mparams.n_gpu_layers = params.n_gpu_layers;
+            llama_model_set_n_gpu_layers(model, params.n_gpu_layers);
         }
     }
 
@@ -1727,7 +1745,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     if (params.warmup) {
-        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_WRN("%s: warming up the model with an empty run - please wait ...\n", __func__);
 
         const uint32_t my_rank = cparams.rank;
         std::vector<llama_token> tmp;

diff --git a/common/profiler.h b/common/profiler.h
index a685ff8c..2f3a20a6 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -313,7 +313,8 @@ struct disk_props {
 };
 
 struct startup_args{
-    bool should_profile;
+    bool     should_profile;
+    uint32_t n_ctx;
 };
 
 struct device_info {

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 13e54e50..dc949d54 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -116,7 +116,7 @@ struct server_task {
 };
 
 struct server_task_result {
-    int id = -1;
+    int id    = -1;
 
     json data;
 
@@ -1063,6 +1063,9 @@ struct server_context {
 
         // clear the entire KV cache
         llama_kv_cache_clear(ctx);
+
+        llama_send_kv_cache_clear(ctx);
+
         clean_kv_cache = false;
     }
 
@@ -1191,7 +1194,7 @@ struct server_context {
             SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }
 
-        // if context shift is disabled, we stop when it reaches the context limit
+        // we stop when it reaches the context limit, otherwise it may run forever
         if (slot.n_decoded >= slot.n_ctx) {
             slot.truncated      = true;
             slot.stopped_limit  = true;
@@ -1917,8 +1920,11 @@ struct server_context {
 
                     SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                    llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_rm      (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
+                    llama_kv_cache_seq_add     (ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+
+                    llama_send_kv_cache_seq_rm (ctx, slot.id    , n_keep            , n_keep + n_discard);
+                    llama_send_kv_cache_seq_add(ctx, slot.id    , n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
 
                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -2084,7 +2090,6 @@ struct server_context {
                     // if input prompt is too big, truncate it (if group attention self-extend is disabled)
                     if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
                         const int n_left = slot.n_ctx - slot.params.n_keep;
-
                         const int n_block_size = n_left / 2;
                         const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
@@ -2161,12 +2166,14 @@ struct server_context {
                     int p0 = (int) system_tokens.size() + slot.n_past;
                     if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
                         // could not partially delete (likely using a non-Transformer model) -
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); + llama_kv_cache_seq_rm (ctx, slot.id + 1, -1, -1); + llama_send_kv_cache_seq_rm(ctx, slot.id , -1, -1); p0 = (int) system_tokens.size(); if (p0 != 0) { // copy over the system prompt when there is one - llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); + llama_kv_cache_seq_cp (ctx, 0, slot.id + 1, -1, -1); + llama_send_kv_cache_seq_cp(ctx, 0, slot.id , -1, -1); } // there is no common part left (except for the system prompt) @@ -2175,6 +2182,8 @@ struct server_context { slot.ga_i = 0; // TODO: is the system prompt ever in the sampling context? gpt_sampler_reset(slot.smpl); + } else { + llama_send_kv_cache_seq_rm(ctx, slot.id, p0, -1); } // remove the non-common part from the cache @@ -2260,9 +2269,14 @@ struct server_context { SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add (ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); + llama_send_kv_cache_seq_add(ctx, slot.id , slot.ga_i, slot.n_past_se, ib * bd); + + llama_kv_cache_seq_div (ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); + llama_send_kv_cache_seq_div(ctx, slot.id , slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); + + llama_kv_cache_seq_add (ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); + llama_send_kv_cache_seq_add(ctx, slot.id , slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; @@ -3329,10 +3343,6 @@ int main(int argc, char ** argv) { // bind HTTP listen port, run the HTTP server in a thread if (!svr->bind_to_port(params.hostname, params.port)) { - //LOG_ERROR("couldn't bind HTTP server socket", { - // {"hostname", params.hostname}, - // {"port", params.port}, - //}); LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); clean_up(); return 1; @@ -3377,10 +3387,6 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.terminate(); }; - LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); - - ctx_server.queue_tasks.start_loop(); - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; @@ -3395,6 +3401,13 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + + ctx_server.queue_tasks.start_loop(); + + char * stop_signal = nullptr; + llama_free_sockets(ctx_server.ctx, &stop_signal); + clean_up(); t.join(); diff --git a/include/llama.h b/include/llama.h index fd4fec40..05b99624 100644 --- a/include/llama.h +++ b/include/llama.h @@ -283,7 +283,7 @@ extern "C" { uint32_t n_world; // number of nodes 
     uint32_t rank;               // my node rank
     uint32_t n_layer_window[32]; // number of layers to kept each time
-    int32_t  n_gpu_layers;       // number of layers to store in VRAM
+    int32_t  n_gpu_layers;       // number of layers to store in VRAM
     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
     // main_gpu interpretation depends on split_mode:
@@ -707,8 +707,10 @@ extern "C" {
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx);
+
+    // Notify other devices to clear their KV cache
+    LLAMA_API void llama_send_kv_cache_clear(struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -720,6 +722,13 @@ extern "C" {
             llama_seq_id seq_id,
             llama_pos p0,
             llama_pos p1);
+
+    // Notify other nodes to remove a range from their KV cache
+    LLAMA_API void llama_send_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
 
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
@@ -731,6 +740,14 @@ extern "C" {
             llama_seq_id seq_id_dst,
             llama_pos p0,
            llama_pos p1);
+
+    // Notify other nodes to copy a range of KV entries
+    LLAMA_API void llama_send_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
 
     // Removes all tokens that do not belong to the specified sequence
     LLAMA_API void llama_kv_cache_seq_keep(
@@ -750,6 +767,14 @@ extern "C" {
             llama_pos p1,
             llama_pos delta);
 
+    // Notify other nodes to shift (add) their KV cache entries
+    LLAMA_API void llama_send_kv_cache_seq_add(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
+
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
@@ -762,6 +787,14 @@ extern "C" {
             llama_pos p0,
             llama_pos p1,
             int d);
+
+    // Notify other nodes to perform a division operation on a KV cache range
+    LLAMA_API void llama_send_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
 
     // Returns the largest position present in the KV cache for the specified sequence
     LLAMA_API llama_pos llama_kv_cache_seq_pos_max(

diff --git a/src/llama.cpp b/src/llama.cpp
index 718327e0..1f68ced0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -121,6 +121,17 @@ struct Timer {
 // helpers
 //
 
+template <typename LocalFn, typename RemoteFn>
+bool kv_cache_op(bool flag,
+                 LocalFn  local_fn,
+                 RemoteFn remote_fn,
+                 bool is_last_dev) {
+    if (!flag) return false;
+    local_fn();
+    if (!is_last_dev) remote_fn();
+    return true;
+}
+
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
     size_t start = 0;
@@ -4157,7 +4168,7 @@ static bool llama_kv_cache_find_slot(
             }
 
             if (n_tested >= cache.size) {
-                //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+                LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
                 return false;
             }
         }
@@ -10629,7 +10640,7 @@ struct llm_build_context {
cb(lctx.inp_K_shift, "K_shift", -1); ggml_set_input(lctx.inp_K_shift); - for (int il = 0; il < n_layer; ++il) { + for (int il = 0; il < (int)kv_self.k_l.size(); ++il) { const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); struct ggml_tensor * rope_factors = build_rope_factors(il); @@ -10642,13 +10653,19 @@ struct llm_build_context { struct ggml_tensor * tmp; if (ggml_is_quantized(k->type)) { + +#ifdef GGML_USE_METAL + GGML_ABORT("The option --cache-type-k is not supported on Metal\n"); +#endif + // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); + for (auto * backend : lctx.backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) { - ggml_backend_sched_set_tensor_backend(lctx.sched.at(0), tmp, backend); // todo. + ggml_backend_sched_set_tensor_backend(lctx.sched[0], tmp, backend); break; } } @@ -17769,7 +17786,39 @@ struct input_tensors { }; struct sync_meta { - int32_t n_tokens = 0; + int32_t n_tokens = 0; + llama_pos * pos = nullptr; + uint32_t n_ctx = 0; + + // signal to clear the kv cache + bool clear_kv_cache = false; + + // signal to remove a kv cache sequence + bool kv_seq_rm = false; + llama_seq_id rm_seq_id = 0; + llama_pos rm_p0 = 0; + llama_pos rm_p1 = 0; + + // signal to add a kv cache sequence + bool kv_seq_add = false; + llama_seq_id add_seq_id = 0; + llama_pos add_p0 = 0; + llama_pos add_p1 = 0; + llama_pos add_delta = 0; + + // signal to copy a kv cache sequence + bool kv_seq_cp = false; + llama_seq_id cp_src_seq_id = 0; + llama_seq_id cp_dst_seq_id = 0; + llama_pos cp_p0 = 0; + llama_pos cp_p1 = 0; + + // signal to divide the kv cache range + bool kv_seq_div = false; + llama_seq_id div_seq_id = 0; + llama_pos div_p0 = 0; + llama_pos div_p1 = 0; + int div_factor = 1; }; static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) { @@ -17781,6 +17830,11 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) { send_msgs.emplace_back("n_tokens", strlen("n_tokens")); send_msgs.emplace_back(&(meta->n_tokens), sizeof(meta->n_tokens)); + if (meta->pos != nullptr) { + send_msgs.emplace_back("pos", strlen("pos")); + send_msgs.emplace_back(meta->pos, meta->n_ctx * sizeof(llama_pos)); + } + zmq::send_multipart(socket, send_msgs); } catch (const zmq::error_t& e) { LLAMA_LOG_INFO("Failed to send meta data: %s\n", e.what()); @@ -17797,6 +17851,49 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) { socket.set(zmq::sockopt::rcvtimeo, -1); + const std::string cmd = recv_msgs[0].to_string(); + size_t idx = 1; + + if (cmd == "clear_kv_cache" && recv_msgs.size() == 1) { + meta->clear_kv_cache = true; + return 0; + } + + if (cmd == "kv_seq_rm" && recv_msgs.size() == 4) { + meta->kv_seq_rm = true; + std::memcpy(&meta->rm_seq_id, recv_msgs[idx++].data(), sizeof(meta->rm_seq_id)); + std::memcpy(&meta->rm_p0, recv_msgs[idx++].data(), sizeof(meta->rm_p0)); + std::memcpy(&meta->rm_p1, recv_msgs[idx++].data(), sizeof(meta->rm_p1)); + return 0; + } + + if (cmd == "kv_seq_add" && recv_msgs.size() == 5) { + meta->kv_seq_add = true; + std::memcpy(&meta->add_seq_id, recv_msgs[idx++].data(), sizeof(meta->add_seq_id)); + std::memcpy(&meta->add_p0, recv_msgs[idx++].data(), sizeof(meta->add_p0)); + std::memcpy(&meta->add_p1, recv_msgs[idx++].data(), sizeof(meta->add_p1)); + std::memcpy(&meta->add_delta, recv_msgs[idx++].data(), sizeof(meta->add_delta)); + 
return 0; + } + + if (cmd == "kv_seq_cp" && recv_msgs.size() == 5) { + meta->kv_seq_cp = true; + std::memcpy(&meta->cp_src_seq_id, recv_msgs[idx++].data(), sizeof(meta->cp_src_seq_id)); + std::memcpy(&meta->cp_dst_seq_id, recv_msgs[idx++].data(), sizeof(meta->cp_dst_seq_id)); + std::memcpy(&meta->cp_p0, recv_msgs[idx++].data(), sizeof(meta->cp_p0)); + std::memcpy(&meta->cp_p1, recv_msgs[idx++].data(), sizeof(meta->cp_p1)); + return 0; + } + + if (cmd == "kv_seq_div" && recv_msgs.size() == 5) { + meta->kv_seq_div = true; + std::memcpy(&meta->div_seq_id, recv_msgs[idx++].data(), sizeof(meta->div_seq_id)); + std::memcpy(&meta->div_p0, recv_msgs[idx++].data(), sizeof(meta->div_p0)); + std::memcpy(&meta->div_p1, recv_msgs[idx++].data(), sizeof(meta->div_p1)); + std::memcpy(&meta->div_factor, recv_msgs[idx++].data(), sizeof(meta->div_factor)); + return 0; + } + for (size_t i = 0; i < recv_msgs.size(); i += 2) { std::string key = recv_msgs[i].to_string(); zmq::message_t & data_msg = recv_msgs[i + 1]; @@ -17805,6 +17902,11 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) { GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens)); std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens)); } + + if (key == "pos") { + meta->pos = (llama_pos *) malloc(meta->n_ctx * sizeof(llama_pos)); + std::memcpy(meta->pos, data_msg.data(), meta->n_ctx * sizeof(llama_pos)); + } } return 0; } @@ -18069,15 +18171,66 @@ static int llama_decode_internal( } sync_meta meta; + meta.n_ctx = cparams.n_ctx; + bool is_last_dev = (my_rank == n_world - 1); + if (my_rank != 0) { if (llama_recv_meta(*lctx.recv_socket, &meta) == -1) { return -1; } - batch_all.n_tokens = meta.n_tokens; + + if (meta.n_tokens > 0) { + batch_all.n_tokens = meta.n_tokens; + if (meta.pos != nullptr) { + batch_all.pos = (llama_pos *) malloc(cparams.n_ctx * sizeof(llama_pos)); + std::memcpy(batch_all.pos, meta.pos, cparams.n_ctx * sizeof(llama_pos)); + } + } + + if (kv_cache_op(meta.clear_kv_cache, + [&]{ llama_kv_cache_clear (&lctx); }, + [&]{ llama_send_kv_cache_clear (&lctx); }, + is_last_dev)) { + LLAMA_LOG_INFO("%s: received signal kv_cache_clear\n", __func__); + return -1; + } + + if (kv_cache_op(meta.kv_seq_rm, + [&]{ llama_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); }, + [&]{ llama_send_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); }, + is_last_dev)) { + LLAMA_LOG_INFO("%s: received signal kv_cache_seq_rm\n", __func__); + return -1; + } + + if (kv_cache_op(meta.kv_seq_add, + [&]{ llama_kv_cache_seq_add (&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); }, + [&]{ llama_send_kv_cache_seq_add(&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); }, + is_last_dev)) { + LLAMA_LOG_INFO("%s: received signal kv_cache_seq_add\n", __func__); + return -1; + } + + if (kv_cache_op(meta.kv_seq_cp, + [&]{ llama_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); }, + [&]{ llama_send_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); }, + is_last_dev)) { + LLAMA_LOG_INFO("%s: received signal kv_cache_seq_cp\n", __func__); + return -1; + } + + if (kv_cache_op(meta.kv_seq_div, + [&]{ llama_kv_cache_seq_div (&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); }, + [&]{ llama_send_kv_cache_seq_div(&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); }, + is_last_dev)) { + LLAMA_LOG_INFO("%s: received signal kv_cache_seq_div\n", __func__); + return -1; + 
} } - if (my_rank != n_world - 1) { + if (!is_last_dev) { meta.n_tokens = batch_all.n_tokens; + meta.pos = batch_all.pos; llama_send_meta(*lctx.send_socket, &meta); } @@ -18803,22 +18956,20 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { - throw std::runtime_error("shift not supported\n"); - if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA GGML_ABORT("Deepseek2 does not support K-shift"); } - { - ggml_backend_sched_reset(lctx.sched.at(0)); // todo. + for (size_t i = 0; i < lctx.sched.size(); ++i) { + ggml_backend_sched_reset(lctx.sched[i]); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - ggml_backend_sched_alloc_graph(lctx.sched.at(0), gf); // todo. + ggml_backend_sched_alloc_graph(lctx.sched[i], gf); llama_set_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.sched.at(0), lctx.cparams.n_threads, lctx.threadpool); // todo. + llama_graph_compute(lctx, gf, lctx.sched[i], lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -18845,8 +18996,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // reserve a worst case graph again if (need_reserve) { - throw std::runtime_error("reserve not supported\n"); - // TODO: extract to a function // build worst-case graph uint32_t n_seqs = 1; // TODO: worst-case number of sequences @@ -18854,13 +19003,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; std::vector gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched[0]); // todo. 
+ GGML_ASSERT(lctx.sched.size() == gf.size()); bool ok = true; - GGML_ASSERT(lctx.sched.size() == gf.size()); for (size_t i = 0; i < gf.size(); ++i) { + ggml_backend_sched_reset(lctx.sched[i]); ok = ok & ggml_backend_sched_reserve(lctx.sched[i], gf[i]); } if (!ok) { @@ -20201,6 +20348,8 @@ void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t m LLAMA_LOG_INFO("Error binding/connecting recv socket to endpoint: %s", e.what()); exit(1); } + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set) { @@ -20264,36 +20413,47 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_ int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) { int32_t n_world = ctx->cparams.n_world; - if (n_world == 1) { - return 0; - } + GGML_ASSERT(n_world > 0); GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr); + if (rank == 0){ // send try { std::vector<zmq::message_t> send_msgs; + send_msgs.emplace_back("should_profile", strlen("should_profile")); send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile)); + + send_msgs.emplace_back("n_ctx", strlen("n_ctx")); + send_msgs.emplace_back(&args->n_ctx, sizeof(args->n_ctx)); + zmq::send_multipart(*ctx->send_socket, send_msgs); } catch (const zmq::error_t& e) { LLAMA_LOG_INFO("Failed to send data: %s\n", e.what()); return -1; } - }else { + } else { // receive std::vector<zmq::message_t> recv_msgs; if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) { return -1; } + GGML_ASSERT(recv_msgs[0].to_string() == "should_profile"); GGML_ASSERT(recv_msgs[1].size() == sizeof(bool)); bool should_profile = *static_cast<bool *>(recv_msgs[1].data()); args->should_profile = should_profile; + + GGML_ASSERT(recv_msgs[2].to_string() == "n_ctx"); + GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t)); + uint32_t n_ctx = *static_cast<uint32_t *>(recv_msgs[3].data()); + args->n_ctx = n_ctx; + if ((int)rank != (int)n_world - 1){ // send try { zmq::send_multipart(*ctx->send_socket, recv_msgs); - } catch (const zmq::error_t& e) { + } catch (const zmq::error_t & e) { LLAMA_LOG_INFO("Failed to send data: %s\n", e.what()); return -1; } @@ -21910,10 +22070,42 @@ void llama_kv_cache_clear(struct llama_context * ctx) { llama_kv_cache_clear(ctx->kv_self); } +void llama_send_kv_cache_clear(struct llama_context * ctx) { + if (ctx->send_socket == nullptr) { + return; + } + + try { + std::vector<zmq::message_t> send_msgs; + const char * cmd = "clear_kv_cache"; + send_msgs.emplace_back(cmd, strlen(cmd)); + zmq::send_multipart(*ctx->send_socket, send_msgs); + } catch (const zmq::error_t & e) { + LLAMA_LOG_INFO("Failed to send KV cache clear signal: %s\n", e.what()); + } +} + bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } +void llama_send_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + if (ctx->send_socket == nullptr) { + return; + } + + try { + std::vector<zmq::message_t> msgs; + msgs.emplace_back("kv_seq_rm", strlen("kv_seq_rm")); + msgs.emplace_back(&seq_id, sizeof(seq_id)); + msgs.emplace_back(&p0, sizeof(p0)); + msgs.emplace_back(&p1, sizeof(p1)); + zmq::send_multipart(*ctx->send_socket, msgs); + } catch (const zmq::error_t & e) { + LLAMA_LOG_WARN("Failed to send kv_seq_rm: %s\n", e.what()); + } +} + void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id
seq_id_dst, llama_pos p0, llama_pos p1) { if (seq_id_src == seq_id_dst) { return; @@ -21921,6 +22113,24 @@ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } +void llama_send_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (ctx->send_socket == nullptr) { + return; + } + + try { + std::vector<zmq::message_t> msgs; + msgs.emplace_back("kv_seq_cp", strlen("kv_seq_cp")); + msgs.emplace_back(&seq_id_src, sizeof(seq_id_src)); + msgs.emplace_back(&seq_id_dst, sizeof(seq_id_dst)); + msgs.emplace_back(&p0, sizeof(p0)); + msgs.emplace_back(&p1, sizeof(p1)); + zmq::send_multipart(*ctx->send_socket, msgs); + } catch (const zmq::error_t & e) { + LLAMA_LOG_WARN("Failed to send kv_seq_cp: %s\n", e.what()); + } +} + void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } @@ -21933,6 +22143,24 @@ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); } +void llama_send_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (ctx->send_socket == nullptr) { + return; + } + + try { + std::vector<zmq::message_t> msgs; + msgs.emplace_back("kv_seq_add", strlen("kv_seq_add")); + msgs.emplace_back(&seq_id, sizeof(seq_id)); + msgs.emplace_back(&p0, sizeof(p0)); + msgs.emplace_back(&p1, sizeof(p1)); + msgs.emplace_back(&delta, sizeof(delta)); + zmq::send_multipart(*ctx->send_socket, msgs); + } catch (const zmq::error_t & e) { + LLAMA_LOG_WARN("Failed to send kv_seq_add: %s\n", e.what()); + } +} + void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { if (d == 1) { return; @@ -21941,6 +22169,24 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +void llama_send_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (ctx->send_socket == nullptr) { + return; + } + + try { + std::vector<zmq::message_t> msgs; + msgs.emplace_back("kv_seq_div", strlen("kv_seq_div")); + msgs.emplace_back(&seq_id, sizeof(seq_id)); + msgs.emplace_back(&p0, sizeof(p0)); + msgs.emplace_back(&p1, sizeof(p1)); + msgs.emplace_back(&d, sizeof(d)); + zmq::send_multipart(*ctx->send_socket, msgs); + } catch (const zmq::error_t & e) { + LLAMA_LOG_WARN("Failed to send kv_seq_div: %s\n", e.what()); + } +} + llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); } From 421b3deca5c9c9fe1d70091ddc995b2eaff7b3fa Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Mon, 19 May 2025 18:08:27 +0400 Subject: [PATCH 26/31] fix llama-cli pos sync --- examples/main/main.cpp | 8 ++++++++ src/llama.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/examples/main/main.cpp index 39d4b60c..ccff70f2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -348,6 +348,9 @@ int main(int argc, char ** argv) { // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + if (my_rank == 0) { + llama_send_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens,
-1); + } } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -593,6 +596,11 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + if (my_rank == 0) { + llama_send_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_send_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + } + n_past -= n_discard; LOG_DBG("after swap: n_past = %d\n", n_past); diff --git a/src/llama.cpp b/src/llama.cpp index 1f68ced0..cd5a95b1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17788,6 +17788,8 @@ struct input_tensors { struct sync_meta { int32_t n_tokens = 0; llama_pos * pos = nullptr; + llama_pos all_pos_0; + llama_pos all_pos_1; uint32_t n_ctx = 0; // signal to clear the kv cache @@ -17835,6 +17837,12 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) { send_msgs.emplace_back(meta->pos, meta->n_ctx * sizeof(llama_pos)); } + send_msgs.emplace_back("all_pos_0", strlen("all_pos_0")); + send_msgs.emplace_back(&(meta->all_pos_0), sizeof(meta->all_pos_0)); + + send_msgs.emplace_back("all_pos_1", strlen("all_pos_1")); + send_msgs.emplace_back(&(meta->all_pos_1), sizeof(meta->all_pos_1)); + zmq::send_multipart(socket, send_msgs); } catch (const zmq::error_t& e) { LLAMA_LOG_INFO("Failed to send meta data: %s\n", e.what()); @@ -17907,6 +17915,16 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) { meta->pos = (llama_pos *) malloc(meta->n_ctx * sizeof(llama_pos)); std::memcpy(meta->pos, data_msg.data(), meta->n_ctx * sizeof(llama_pos)); } + + if (key == "all_pos_0") { + GGML_ASSERT(data_msg.size() == sizeof(meta->all_pos_0)); + std::memcpy(&(meta->all_pos_0), data_msg.data(), sizeof(meta->all_pos_0)); + } + + if (key == "all_pos_1") { + GGML_ASSERT(data_msg.size() == sizeof(meta->all_pos_1)); + std::memcpy(&(meta->all_pos_1), data_msg.data(), sizeof(meta->all_pos_1)); + } } return 0; } @@ -18185,6 +18203,8 @@ static int llama_decode_internal( batch_all.pos = (llama_pos *) malloc(cparams.n_ctx * sizeof(llama_pos)); std::memcpy(batch_all.pos, meta.pos, cparams.n_ctx * sizeof(llama_pos)); } + batch_all.all_pos_0 = meta.all_pos_0; + batch_all.all_pos_1 = meta.all_pos_1; } if (kv_cache_op(meta.clear_kv_cache, @@ -18229,8 +18249,10 @@ static int llama_decode_internal( } if (!is_last_dev) { - meta.n_tokens = batch_all.n_tokens; - meta.pos = batch_all.pos; + meta.n_tokens = batch_all.n_tokens; + meta.pos = batch_all.pos; + meta.all_pos_0 = batch_all.all_pos_0; + meta.all_pos_1 = batch_all.all_pos_1; llama_send_meta(*lctx.send_socket, &meta); } From b30f749e5e2456f8e67136c3b19d7be157369ece Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Tue, 3 Jun 2025 14:06:31 +0400 Subject: [PATCH 27/31] fix n_embd cannot be divided by quantized block size --- common/common.cpp | 3 +++ common/profiler.cpp | 34 +++++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 89707fb7..c90048dc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1588,6 +1588,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { if (n_world == 1) { uint32_t n_layers = llama_model_n_layers(model); + // assign all layers to this device params.n_layer_window[0] = n_layers; 
cparams.n_layer_window[0] = n_layers; @@ -1596,6 +1597,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers); + cparams.n_gpu_layers = params.n_gpu_layers; + mparams.n_gpu_layers = params.n_gpu_layers; #endif } else { diff --git a/common/profiler.cpp b/common/profiler.cpp index b842071c..b54bb0be 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -48,6 +48,16 @@ #include +static int gcd_int(int a, int b) { + while (b != 0) { + int t = b; + b = a % b; + a = t; + } + return a; +} + + static size_t get_page_size() { size_t page_size = 0; @@ -154,8 +164,25 @@ uint32_t device_cpu_cores() { static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) { int n_repeat = 1; - int n_embd = std::min(llama_n_embd(model), 4096); - if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu + int n_embd = std::min(llama_n_embd(model), 4096); + + // simulate small tensor calculation on cpu + if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; + + // ensure that the block sizes of the tensors are compatible + int bs0 = ggml_blck_size(src0t); + int bs1 = ggml_blck_size(src1t); + int gcd = gcd_int(bs0, bs1); + int lcm = bs0 / gcd * bs1; + + if (n_embd % bs0 != 0 || n_embd % bs1 != 0) { + if (n_embd < lcm) { + n_embd = 2 * lcm; + } else { + n_embd = 2 * (n_embd / lcm) * lcm; + } + } + std::vector matrix_A(n_embd * n_embd, 1.0f); std::vector matrix_B(n_embd * n_embd, 1.0f / n_embd); @@ -188,9 +215,6 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum }; struct ggml_context * ctx = ggml_init(params); - if(n_embd < ggml_blck_size(src0t)){ - n_embd = 2 * ggml_blck_size(src0t); - } struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd); struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd); From 1b3b6a506f8538c0192fa659dcb524d394bee7c1 Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Tue, 3 Jun 2025 17:10:09 +0400 Subject: [PATCH 28/31] fix: add warm-up in profiling to prevent init delay --- common/profiler.cpp | 5 ++++- src/llama.cpp | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index b54bb0be..a2ac33b5 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -439,7 +439,7 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in } // warm-up - // ggml_backend_graph_compute(backend, gf); + ggml_backend_graph_compute(backend, gf); const int64_t t_start = ggml_time_us(); ggml_backend_graph_compute(backend, gf); @@ -1288,6 +1288,9 @@ static float device_mem_copy(struct llama_model * model, enum profiler_backend_t ggml_backend_cpu_set_n_threads(backend, n_threads); } + // warm-up + ggml_backend_graph_compute(backend, gf); + const int64_t t_start = ggml_time_us(); ggml_backend_graph_compute(backend, gf); const int64_t t_end = ggml_time_us(); diff --git a/src/llama.cpp b/src/llama.cpp index cd5a95b1..2cc8da15 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18211,7 +18211,7 @@ static int llama_decode_internal( [&]{ llama_kv_cache_clear (&lctx); }, [&]{ llama_send_kv_cache_clear (&lctx); }, is_last_dev)) { - LLAMA_LOG_INFO("%s: received signal kv_cache_clear\n", __func__); + LLAMA_LOG_DEBUG("%s: received signal kv_cache_clear\n", 
__func__); return -1; } @@ -18219,7 +18219,7 @@ static int llama_decode_internal( [&]{ llama_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); }, [&]{ llama_send_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); }, is_last_dev)) { - LLAMA_LOG_INFO("%s: received signal kv_cache_seq_rm\n", __func__); + LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_rm\n", __func__); return -1; } @@ -18227,7 +18227,7 @@ static int llama_decode_internal( [&]{ llama_kv_cache_seq_add (&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); }, [&]{ llama_send_kv_cache_seq_add(&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); }, is_last_dev)) { - LLAMA_LOG_INFO("%s: received signal kv_cache_seq_add\n", __func__); + LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_add\n", __func__); return -1; } @@ -18235,7 +18235,7 @@ static int llama_decode_internal( [&]{ llama_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); }, [&]{ llama_send_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); }, is_last_dev)) { - LLAMA_LOG_INFO("%s: received signal kv_cache_seq_cp\n", __func__); + LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_cp\n", __func__); return -1; } @@ -18243,7 +18243,7 @@ static int llama_decode_internal( [&]{ llama_kv_cache_seq_div (&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); }, [&]{ llama_send_kv_cache_seq_div(&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); }, is_last_dev)) { - LLAMA_LOG_INFO("%s: received signal kv_cache_seq_div\n", __func__); + LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_div\n", __func__); return -1; } } From 64390909208c941f61e51b7358dc7a548ea19669 Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Tue, 3 Jun 2025 23:53:24 +0400 Subject: [PATCH 29/31] reformat code --- Makefile | 2 +- common/common.cpp | 133 +++++++++++++++++++++-------------------- examples/main/main.cpp | 3 +- include/llama.h | 7 ++- src/llama.cpp | 122 ++++++++++++++++++------------------- 5 files changed, 137 insertions(+), 130 deletions(-) diff --git a/Makefile b/Makefile index 60cfc22f..06d91984 100644 --- a/Makefile +++ b/Makefile @@ -280,7 +280,7 @@ ifeq ($(USE_HIGHS),1) HIGHS_LDFLAGS = -L/usr/local/lib -lhighs ifeq ($(UNAME_S),Darwin) HIGHS_CPPFLAGS += -isystem /opt/homebrew/include/highs - HIGHS_LDFLAGS += -L/opt/homebrew/lib -lhighs + HIGHS_LDFLAGS += -L/opt/homebrew/lib endif MK_CPPFLAGS += $(HIGHS_CPPFLAGS) -DUSE_HIGHS MK_LDFLAGS += $(HIGHS_LDFLAGS) diff --git a/common/common.cpp b/common/common.cpp index 378ab87f..0072996c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -847,8 +847,7 @@ static std::string vec_to_str(const std::vector & vec) { } static bool assign_layers_to_device( - uint32_t n_world, - uint32_t my_rank, + uint32_t n_world, const device_info * dev_info_set, uint32_t * n_layer_window, uint32_t * n_gpu_layers, @@ -857,15 +856,8 @@ static bool assign_layers_to_device( float min_disk_read_speed = 0.1f) { // minimum disk I/O speed: 100 MB/s GGML_ASSERT(dev_info_set != nullptr); GGML_ASSERT(n_layer_window != nullptr); - GGML_ASSERT(my_rank == 0); - // if only 1 device, it is assigned all layers const uint32_t n_layer = llama_model_n_layers(model); - if (n_world == 1) { - n_layer_window[0] = n_layer; - return true; - } - std::vector w(n_world, 0); std::vector n(n_world, 0); std::vector mem_budget(n_world, 0.0f); @@ -1102,7 +1094,6 @@ static bool assign_layers_to_device( }; 
(void)print_matrix; - double final_objective = 1.0e30; std::vector<double> final_solution; int final_k = -1; @@ -1442,7 +1433,6 @@ static bool assign_layers_to_device( // update the global best solution final_k = best_k; - final_objective = best_objective; final_solution = best_solution; if (solution_unchanged) break; @@ -1461,8 +1451,7 @@ static bool assign_layers_to_device( LOG_INF(" - N Layer Window : %d\n", w[m]); LOG_INF(" - N GPU Layers : %d\n", n[m]); } - // LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective); - // LOG_INF("------------------------------------------"); + LOG_INF("\n"); // copy value from w and n to n_layer_window and n_gpu_layers, respectively std::copy(w.begin(), w.end(), n_layer_window); @@ -1522,58 +1511,67 @@ static bool assign_layers_to_device( return true; } -static bool tune_layer_allocation( - uint32_t n_world, - uint32_t my_rank, +static bool assign_layers_and_select_devices( + uint32_t n_world, std::vector<device_info> dev_infos, uint32_t * n_layer_window, uint32_t * n_gpu_layers, struct llama_model * model, - const struct llama_context_params cparams, - float min_disk_read_speed = 0.1f) { + const struct llama_context_params cparams) { memset(n_layer_window, 0, n_world * sizeof(uint32_t)); - memset(n_gpu_layers, 0, n_world * sizeof(uint32_t)); + memset(n_gpu_layers, 0, n_world * sizeof(uint32_t)); + std::vector<device_info> dev_infos_temp = dev_infos; - std::vector<uint32_t> n_layer_windows_temp; - std::vector<uint32_t> n_gpu_layers_temp; - while(n_world > 0) { + std::vector<uint32_t> n_layer_windows_temp, n_gpu_layers_temp; + + while (n_world > 0) { std::vector<device_info> dev_infos_ = dev_infos_temp; - std::vector<uint32_t> n_layer_windows_(n_world, 0); - std::vector<uint32_t> n_gpu_layers_(n_world, 0); - if (!assign_layers_to_device(n_world, my_rank, dev_infos_.data(), + std::vector<uint32_t> n_layer_windows_(n_world, 0), n_gpu_layers_(n_world, 0); + + if (!assign_layers_to_device(n_world, dev_infos_.data(), n_layer_windows_.data(), n_gpu_layers_.data(), model, cparams)) { return false; } + dev_infos_temp.clear(); n_layer_windows_temp.clear(); n_gpu_layers_temp.clear(); - for(uint32_t i=0; i<n_world; i++){ - if(n_layer_windows_[i] > 1 || i==0 ) { + + for (uint32_t i = 0; i < n_world; i++) { + if (n_layer_windows_[i] > 1 || i == 0 ) { dev_infos_temp.push_back(dev_infos_[i]); n_layer_windows_temp.push_back(n_layer_windows_[i]); n_gpu_layers_temp.push_back(n_gpu_layers_[i]); + } else { + // remove this device + LOG_INF("Remove device %s (rank %d) with only %d layer assigned.\n", + dev_infos_[i].device_name, dev_infos_[i].rank, n_layer_windows_[i]); } } + if(dev_infos_temp.size() == n_world) { // no device was removed break; } n_world = dev_infos_temp.size(); + + LOG_INF("Reassign layers to the remaining %d device(s).\n\n", n_world); } + + uint32_t i = 0 , j = 0; while (j < n_world) { if (dev_infos[i].rank == dev_infos_temp[j].rank) { n_layer_window[i] = n_layer_windows_temp[j]; - n_gpu_layers[i] = n_gpu_layers_temp[j]; + n_gpu_layers[i] = n_gpu_layers_temp[j]; j++; - i++; } else { n_layer_window[i] = 0; n_gpu_layers[i] = 0; - i++; } + i++; } + return true; } @@ -1698,16 +1696,14 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_gather_device_info(lctx, dev_info_set.data()); device_print_props(dev_info_set.data(), n_world, model, cparams); - // automatically determine n_layer_window and n_gpu_layers - if (!tune_layer_allocation(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) { + // assign layers to devices and remove weak devices +
if (!assign_layers_and_select_devices(n_world, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) { LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__); llama_free(lctx); llama_free_model(model); return iparams; } llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers); - - //rebuild topo llama_rebuild_topo(lctx, n_layer_window, dev_info_set.data()); } else { // use the user-defined n_layer_window @@ -1718,51 +1714,58 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { if (auto_schedule){ llama_send_device_info(lctx, &dev_info); llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers); - // rebuild topo - llama_rebuild_topo(lctx,n_layer_window, nullptr); - }else{ + llama_rebuild_topo (lctx, n_layer_window, nullptr); + } else { llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers); } } - if(n_layer_window[my_rank]<=0){ - LOG_INF("%s: info: rank %d has no layers to run, skipping\n", __func__, my_rank); + + // if this is a weak device, then exit + if (n_layer_window[my_rank] <= 0) { + LOG_INF("No layer is assigned to me, exit.\n"); llama_free(lctx); llama_free_model(model); exit(0); } - //update rank and n_world for consistency - uint32_t update_rank = 0; - uint32_t update_n_world = 1; - std::vector n_layer_window_temp = {n_layer_window[0]}; - std::vector n_gpu_layers_temp = {n_gpu_layers[0]}; - for(uint32_t i=1; i n_layer_window_temp = {n_layer_window[0]}, n_gpu_layers_temp = {n_gpu_layers[0]}; + + for (uint32_t i = 1; i < n_world; i++) { + if (n_layer_window[i] <= 0) { continue; } - if(i <= my_rank){ + if (i <= my_rank) { update_rank++; } update_n_world++; n_layer_window_temp.push_back(n_layer_window[i]); n_gpu_layers_temp.push_back(n_gpu_layers[i]); } - memset(n_layer_window, 0, n_world * sizeof(uint32_t)); - memset(n_gpu_layers, 0, n_world * sizeof(uint32_t)); - for (uint32_t i=0; icpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); + dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_F16)) { - dev_info->cpu_props.flops_f16_f32 = device_cpu_flops (model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); + dev_info->cpu_props.flops_f16_f32 = device_cpu_flops (model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) { - dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, 
GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) { - dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); + dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) { - dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q5k_f32 = device_cuda_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32); + dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q5k_f32 = device_cuda_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_Q6_K)) { - dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); + dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) { - dev_info->cpu_props.flops_iq2xxs_f32 = device_cpu_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads); + dev_info->cpu_props.flops_iq2xxs_f32 = device_cpu_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_iq2xxs_f32= device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_iq2xxs_f32 = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { - dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) { - dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, 
n_threads); - dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); + dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) { - dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1s_f32 = device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) { - dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq4nl_f32 = device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); } if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) { - dev_info->cpu_props.flops_iq1m_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_iq1m_f32= device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); + dev_info->cpu_props.flops_iq1m_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1m_f32 = device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32); } } @@ -7470,6 +7470,8 @@ static void llm_load_qwen2_tensors( const uint32_t * n_layer_window, bool * use_mmap_buffer, bool set_needed) { + (void)use_mmap_buffer; // unused in this function + const auto tn = LLM_TN(model.arch); ggml_context * ctx_input = nullptr; @@ -7487,8 +7489,7 @@ static void llm_load_qwen2_tensors( const llama_hparams hparams = model.hparams; const int64_t n_embd = hparams.n_embd; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // const int64_t n_embd_gqa = n_embd_v_gqa; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); const int64_t n_ff = hparams.n_ff(); const int64_t n_vocab = hparams.n_vocab; const int64_t n_layer = hparams.n_layer; @@ -20525,14 +20526,12 @@ int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_windo return 0; } -LLAMA_API int llama_rebuild_topo(llama_context *ctx, - uint32_t *n_layer_window, - device_info *dev_info_set) { +int llama_rebuild_topo(llama_context * ctx, uint32_t * 
n_layer_window, device_info * dev_info_set) { uint32_t n_world = ctx->cparams.n_world; uint32_t my_rank = ctx->cparams.rank; - device_info* dev_info_ptr = nullptr; - if (dev_info_set == nullptr){ - // for rank!=0, recv all devices info + device_info * dev_info_ptr = nullptr; + + if (dev_info_set == nullptr) { std::vector msgs; if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(msgs))) { return -1; @@ -20542,7 +20541,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, deserialize((const char *)msgs[i].data(), &dev_info_ptr[i]); } GGML_ASSERT(msgs.size() == n_world); - }else{ + } else { dev_info_ptr = dev_info_set; } @@ -20550,7 +20549,7 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, // notify next rank auto next_rank = (my_rank + 1) % n_world; - if(n_layer_window[next_rank] <= 0 && next_rank != 0){ + if (n_layer_window[next_rank] <= 0 && next_rank != 0) { try { auto msgs = dev_infos_to_messages(dev_info_ptr, n_world); ctx->send_socket->set(zmq::sockopt::linger, 3500); @@ -20564,22 +20563,23 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, } } - // check myself's layer - zmq::socket_t* socket_to_close = nullptr; - if(n_layer_window[my_rank] > 0) { + zmq::socket_t * socket_to_close = nullptr; + if (n_layer_window[my_rank] > 0) { // reconstruct socket to the next valid rank std::string next_ip; auto current_rank = my_rank; - while(next_rank!=my_rank){ - if(n_layer_window[next_rank] > 0){ + + while (next_rank != my_rank) { + if (n_layer_window[next_rank] > 0) { next_ip = dev_info_ptr[current_rank].next_ip; break; } - next_rank = (next_rank + 1) % n_world; + next_rank = (next_rank + 1) % n_world; current_rank = (current_rank + 1) % n_world; } - if(!next_ip.empty()){ - if((my_rank+1)%n_world != next_rank){ + + if (!next_ip.empty()) { + if ((my_rank + 1) % n_world != next_rank) { socket_to_close = ctx->send_socket; ctx->send_socket = new zmq::socket_t(*ctx->sock_context, zmq::socket_type::push); std::string send_endp = "tcp://" + next_ip + ":" + std::to_string(map_rank_to_port(next_rank, ctx->data_port)); @@ -20587,7 +20587,8 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, ctx->next_node_ip = next_ip; ctx->cparams.original_next_rank = next_rank; } - if(next_rank != 0){ + + if (next_rank != 0) { try { auto msgs = dev_infos_to_messages(dev_info_ptr, n_world); zmq::send_multipart(*ctx->send_socket, msgs); @@ -20599,18 +20600,21 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx, return -1; } } - }else{ + } else { // only one node ctx->next_node_ip = ""; } } - if(!dev_info_set){ + + if (!dev_info_set) { delete[] dev_info_ptr; } + if(socket_to_close != nullptr){ socket_to_close->close(); delete socket_to_close; } + return 0; } @@ -20675,11 +20679,9 @@ void llama_free_sockets(struct llama_context * ctx, char ** msg) { } } -void llama_update_context_with_rankworld(struct llama_context * ctx, - uint32_t rank, - uint32_t n_world) { - if(ctx) { - ctx->cparams.rank = rank; +void llama_update_context_with_rankworld(struct llama_context * ctx, uint32_t rank, uint32_t n_world) { + if (ctx) { + ctx->cparams.rank = rank; ctx->cparams.n_world = n_world; } } From 27756ee182e5cb7231a610ca2f06d8d3bdbb602a Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Wed, 4 Jun 2025 15:11:29 +0400 Subject: [PATCH 30/31] fix: enable rolling back set assignment when all devices are assigned to M4 but no feasible solutions --- common/common.cpp | 49 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git 
a/common/common.cpp b/common/common.cpp index 0072996c..dff98506 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1094,8 +1094,8 @@ static bool assign_layers_to_device( }; (void)print_matrix; - std::vector final_solution; - int final_k = -1; + std::vector final_solution, rollback_solution; + int final_k = -1, rollback_k = -1; // iterative optimization to find a valid set assignment (M1, M2, M3, M4) while (true) { @@ -1367,19 +1367,48 @@ static bool assign_layers_to_device( // get the solution const HighsModelStatus& model_status = highs.getModelStatus(); - if (model_status != HighsModelStatus::kOptimal) continue; + + if (model_status != HighsModelStatus::kOptimal) { + bool is_all_in_M4 = true; + for (uint32_t m = 0; m < n_world; ++m) { + if (!in_set(m, M4)) { + is_all_in_M4 = false; + break; + } + } + if (!is_all_in_M4) continue; + } // record the best solution const HighsSolution& solution = highs.getSolution(); double objective_value = highs.getInfo().objective_function_value; - if (objective_value < best_objective) { - best_objective = objective_value; - best_k = k; - best_solution = solution.col_value; - } - LOG_INF("k = %2d, obj = %7.1f, solution: %s | best_k = %2d, best_obj = %7.1f, best_solution: %s\n", - k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str()); + if (solution.value_valid) { + if (objective_value < best_objective) { + best_objective = objective_value; + best_k = k; + best_solution = solution.col_value; + } + LOG_INF("k = %2d, obj = %7.1f, solution: %s | best_k = %2d, best_obj = %7.1f, best_solution: %s\n", + k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str()); + } + } + + if (best_solution.empty()) { + LOG_INF("No feasible solution found for this set assignment, rolling back to previous sets.\n"); + + final_solution = rollback_solution; + final_k = rollback_k; + + // update w[m] and n[m] + GGML_ASSERT(final_solution.size() == n_world * 2 && "Invalid solution\n"); + std::copy(final_solution.begin(), final_solution.begin() + n_world, w.begin()); + std::copy(final_solution.begin() + n_world, final_solution.end(), n.begin()); + + break; + } else { + rollback_solution = best_solution; + rollback_k = best_k; } // check the solution From ef1e10101e6150ad0eed624dc496ea55cf301484 Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Wed, 4 Jun 2025 15:12:00 +0400 Subject: [PATCH 31/31] add test for IQ1 and doc for device selection --- README.md | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 17661e2e..fd2c3b2e 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ And, if your devices are more powerful, you could unlock even more possibilities > Device D4 runs inside a Termux-simulated Linux. Device D1 reads disk data in random mode and D2~D4 read in sequential mode. -**Table 2:** Token latency for Llama models. +**Table 2:** Token latency for Llama models (w/o device selection). 
| **Model** | **llama.cpp** | **exo** | **dllama** | **prima.cpp** |
|-----------------|---------------|-----------|------------|---------------|
| Llama 3-8B | **15 ms** | 263 ms | 459 ms | 54 ms |
| Llama 1-65B | 8807 ms | - | - | **569 ms** |
| Llama 3-70B | 10120 ms | OOM | OOM | **674 ms** |
-**Table 3:** Token latency for Qwen 2.5, QwQ, and DeepSeek R1 models.
+**Table 3:** Token latency for Qwen 2.5, QwQ, and DeepSeek R1 models (w/o device selection).
| **Model** | **llama.cpp** | **exo** | **dllama** | **prima.cpp** |
|-----------------------------------|---------------|---------------|------------|---------------|
@@ -61,7 +61,9 @@ And, if your devices are more powerful, you could unlock even more possibilities
> As video recording consumes some RAM, prima.cpp proactively reduces memory usage, resulting in slightly higher latency in the video compared to the table.
-> In current implementation, each device is assigned at least one model layer. For example, this leads to a 1:1:29:1 split for Llama 3-8B, which makes prima.cpp less efficient. In future updates, we will have a 0:0:32:0 split and idle devices removed, then llama.cpp would become a special case of prima.cpp when serving small models.
+> In the old version (w/o device selection), each device is assigned at least one model layer. This would lead to a 1:1:29:1 split for Llama 3-8B, which makes prima.cpp slower than llama.cpp.
+>
+> **New:** In the latest version (with device selection), we will have a 0:0:32:0 split and weak devices removed, then prima.cpp effectively becomes llama.cpp when serving small models.

## 🔑 Key Features
@@ -70,6 +72,7 @@
- **GPU & CPU Offloading:** If a device has a GPU, you can use both GPU and CPU for inference. For example, when VRAM is full, we can offload some model layers to RAM.
- **Piped-ring parallelism with prefetching:** Prefetch upcoming layer weights to overlap disk loading latency and use advanced piped-ring parallelism to prevent the "prefetch-release" effect. This new parallelism improves pipeline parallelism by using a ring structure and allows devices to run multiple cycles to predict a new token.
- **Heterogeneity-aware workload distribution:** A scheduler is designed to optimize workload distribution based on each device's computing power, disk speed, memory, and OS (the OS will affect the disk speed and the memory management strategy). It decides how many model layers a device should handle and how many should run on GPU (if available).
+- **Automatic device selection:** If there are weak devices and removing them would speed up inference, prima.cpp will automatically discover and remove them.
- **Quantization:** We now support Q4K, Q6K, Q80 and IQ1 quantization (GGUF format) and are exploring a Q4K-IQ1 hybrid for a better balance between performance and speed.
- **Support Models:** We now support hot models like the **Llama, Qwen (and QwQ), and DeepSeek series**. More will be added in future updates.
- **Cross-Platform:** The cluster can consist of devices with different OSs, including macOS, Linux, Android, HarmonyOS, etc. Now, Android and HarmonyOS devices require Termux, and Windows support will be added in a future update.
@@ -78,26 +81,26 @@
Here are the models we have tested so far. You can also try more on Hugging Face!

### Llama
-- **Llama 3-8B (Q4K, Q6K, Q80):** [Meta-Llama-3-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF)
+- **Llama 3-8B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/LLama-3-8b-Uncensored-i1-GGUF)):** [Meta-Llama-3-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF)
- **Llama 3-14B (Q4K, Q6K, Q80):** [Llama-3-14B-Instruct-v1](https://huggingface.co/RDson/Llama-3-14B-Instruct-v1-GGUF)
-- **Llama 1-30B (Q4K, Q6K, Q80):** [upstage-llama-30b-instruct-2048](https://huggingface.co/TheBloke/upstage-llama-30b-instruct-2048-GGUF)
-- **Llama 3-45B (Q4K, Q6K, Q80):** [Llama-3-pruned-45B-Drobeta-Turnu-Severin](https://huggingface.co/mradermacher/Llama-3-pruned-45B-Drobeta-Turnu-Severin-GGUF)
-- **Llama 3-60B (Q4K, Q6K, Q80):** [nyun-llama3-60B](https://huggingface.co/mradermacher/nyun-llama3-60B-GGUF)
-- **Llama 1-65B (Q4K, Q6K, Q80):** [llama-65b](https://huggingface.co/TheBloke/LLaMA-65B-GGUF)
-- **Llama 3-70B (Q4K, Q6K, Q80):** [Meta-Llama-3-70B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3-70B-Instruct-GGUF)
+- **Llama 1-30B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/LLaMA-30B-HF-i1-GGUF)):** [upstage-llama-30b-instruct-2048](https://huggingface.co/TheBloke/upstage-llama-30b-instruct-2048-GGUF)
+- **Llama 3-45B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Llama-3-pruned-45B-Drobeta-Turnu-Severin-i1-GGUF)):** [Llama-3-pruned-45B-Drobeta-Turnu-Severin](https://huggingface.co/mradermacher/Llama-3-pruned-45B-Drobeta-Turnu-Severin-GGUF)
+- **Llama 3-60B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/nyun-llama3-60B-i1-GGUF)):** [nyun-llama3-60B](https://huggingface.co/mradermacher/nyun-llama3-60B-GGUF)
+- **Llama 1-65B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/llama-65b-instruct-i1-GGUF)):** [llama-65b](https://huggingface.co/TheBloke/LLaMA-65B-GGUF)
+- **Llama 3-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Meta-Llama-3-70B-Instruct-DPO-i1-GGUF)):** [Meta-Llama-3-70B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3-70B-Instruct-GGUF)

### Qwen 2.5 / QwQ
-- **Qwen 2.5-7B (Q4K, Q6K, Q80):** [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF)
-- **Qwen 2.5-14B (Q4K, Q6K, Q80):** [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GGUF)
-- **Qwen 2.5-32B (Q4K, Q6K, Q80):** [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GGUF)
-- **Qwen 2.5-72B (Q4K, Q6K, Q80):** [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF)
-- **QwQ-32B (Q4K, Q6K, Q80):** [qwq-32b](https://huggingface.co/Qwen/QwQ-32B-GGUF)
+- **Qwen 2.5-7B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-7B-i1-GGUF)):** [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF)
+- **Qwen 2.5-14B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-14B-i1-GGUF)):** [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GGUF)
+- **Qwen 2.5-32B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-32B-i1-GGUF)):** [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GGUF)
+- **Qwen 2.5-72B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-72B-Instruct-i1-GGUF)):** [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF)
+- **QwQ-32B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/QwQ-32B-i1-GGUF)):** [qwq-32b](https://huggingface.co/Qwen/QwQ-32B-GGUF)

### DeepSeek
-- **DeepSeek R1-7B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-7B-GGUF)
-- **DeepSeek R1-8B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF)
-- **DeepSeek R1-14B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF)
-- **DeepSeek R1-32B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF)
+- **DeepSeek R1-7B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/DeepSeek-R1-Distill-Qwen-7B-Uncensored-i1-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-7B-GGUF)
+- **DeepSeek R1-8B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/DeepSeek-R1-Distill-Llama-8B-i1-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF)
+- **DeepSeek R1-14B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-14B-DeepSeek-R1-1M-Uncensored-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF)
+- **DeepSeek R1-32B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/deepseek-r1-qwen-2.5-32B-ablated-i1-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF)
- **DeepSeek R1-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF)):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)

## ⚙️ How to Use?
@@ -154,6 +157,7 @@ make GGML_CUDA=1 -j$(nproc)
make LLAMA_NO_METAL=1 -j$(nproc)

# To enable debug mode, add LLAMA_DEBUG=1:
+# WARNING: Running in DEBUG mode will slow down inference!
make LLAMA_DEBUG=1 -j$(nproc)

# Otherwise, just use:
@@ -356,6 +360,10 @@ Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS,
Not yet. Now prima.cpp supports only CUDA-based GPUs. Vulkan is in our roadmap, and AMD GPUs will be supported once we have that device.

+**7. Why did I get "No layer is assigned to me, exit"?**
+
+No worries, this is expected. Prima.cpp found that this device was too slow, and dropping it could speed up inference, so it was removed.
+
## ❤️ Acknowledgment

This project builds upon the incredible work from the open-source community, especially [ggml, gguf](https://github.com/ggml-org/ggml), and [llama.cpp](https://github.com/ggml-org/llama.cpp). We gratefully acknowledge their contributions.
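
For orientation, the sketch below shows how the send-side KV-cache calls added earlier in this series are meant to compose with the local `llama_kv_cache_*` operations during a context shift, mirroring the llama-cli change in PATCH 26. This is a minimal illustration under stated assumptions, not part of the patches: the `shift_context` helper and its parameters are hypothetical, and error handling is omitted.

```cpp
// Minimal sketch (hypothetical helper, not part of the patch series).
// Rank 0 edits its local KV cache and then broadcasts the same operation;
// the other ranks apply and forward the signal when llama_recv_meta()
// picks it up inside llama_decode_internal().
#include "llama.h"

static void shift_context(llama_context * ctx, uint32_t my_rank,
                          llama_pos n_keep, llama_pos n_discard, llama_pos n_past) {
    // drop the oldest tokens after the keep window, then slide the rest back
    llama_kv_cache_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);

    // only rank 0 originates the matching "kv_seq_rm" / "kv_seq_add" messages
    if (my_rank == 0) {
        llama_send_kv_cache_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
        llama_send_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
    }
}
```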