From 678d7994f4da0af3d29046be99950ac999ee9762 Mon Sep 17 00:00:00 2001 From: Ting Lou Date: Fri, 29 Nov 2024 08:09:46 +0800 Subject: [PATCH 01/10] llava: return false instead of exit (#10546) --- examples/llava/clip.cpp | 15 +++++++++++---- examples/llava/llava.cpp | 28 +++++++++++++++++++--------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index aae49c965..7ba4cea58 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -40,10 +40,17 @@ #include #include -#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#if defined(LLAVA_LOG_OFF) +# define LOG_INF(...) +# define LOG_WRN(...) +# define LOG_ERR(...) +# define LOG_DBG(...) +#else // defined(LLAVA_LOG_OFF) +# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#endif // defined(LLAVA_LOG_OFF) //#define CLIP_DEBUG_FUNCTIONS diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index be6988540..4ca53a0b8 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -11,13 +11,17 @@ #include #include -#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) -#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) - -#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#if defined(LLAVA_LOG_OFF) +# define LOG_INF(...) +# define LOG_WRN(...) +# define LOG_ERR(...) +# define LOG_DBG(...) +#else // defined(LLAVA_LOG_OFF) +# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#endif // defined(LLAVA_LOG_OFF) // RGB uint8 image struct clip_image_u8 { @@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long errno = 0; size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer if (ferror(file)) { - die_fmt("read error: %s", strerror(errno)); + LOG_ERR("read error: %s", strerror(errno)); + free(buffer); + fclose(file); + return false; } if (ret != (size_t) fileSize) { - die("unexpectedly reached end of file"); + LOG_ERR("unexpectedly reached end of file"); + free(buffer); + fclose(file); + return false; } fclose(file); // Close the file From f095a649ec390e04dfab1b04e646ae8549dafaef Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 29 Nov 2024 00:18:02 -0600 Subject: [PATCH 02/10] vulkan: get the first command buffer submitted sooner (#10499) This is an incremental improvement over #9118 to get work to the GPU a bit sooner. The first part is to start with a smaller number of nodes before the first submit, and ramp it up to the current 100 nodes/submit. The second part is to reduce the dryrun overhead for all the nodes that just need to request descriptor space. With these changes I get around 1-2% speedup on RTX 4070 combined with my old Haswell-era CPU. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 60 ++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a833007fb..849c11923 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5672,6 +5672,48 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } else { compute_ctx = ctx->compute_ctx.lock(); } + } else { + switch (node->op) { + case GGML_OP_REPEAT: + case GGML_OP_ACC: + case GGML_OP_GET_ROWS: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_CONCAT: + case GGML_OP_UPSCALE: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SIN: + case GGML_OP_COS: + case GGML_OP_CLAMP: + case GGML_OP_PAD: + case GGML_OP_CPY: + case GGML_OP_CONT: + case GGML_OP_DUP: + case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_UNARY: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ARGSORT: + case GGML_OP_SUM_ROWS: + case GGML_OP_IM2COL: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_POOL_2D: + case GGML_OP_LEAKY_RELU: + { + // These operations all go through ggml_vk_op_f32, so short-circuit and + // do the only thing needed for the dryrun. + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op); + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + return false; + } + default: + break; + } } switch (node->op) { @@ -6401,16 +6443,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool first_node_in_batch = true; // true if next node will be first node in a batch int submit_node_idx = 0; // index to first node in a batch - // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution - constexpr int submit_count = 100; + // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution. + // Start with a smaller count to get work submitted right away, and increase it after each submit. + int nodes_per_submit = 20; int submitted_nodes = 0; + int submit_count = 0; for (int i = 0; i < cgraph->n_nodes; i++) { if (first_node_in_batch) { submit_node_idx = i; } - bool submit = (submitted_nodes >= submit_count) || (i == last_node); - + bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node); bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); @@ -6427,6 +6470,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (submit) { first_node_in_batch = true; submitted_nodes = 0; + switch (submit_count) { + case 0: + nodes_per_submit = 50; + break; + default: + nodes_per_submit = 100; + break; + } + submit_count++; } } From 938f6087421889a3af7d0786c64406ced2be81b8 Mon Sep 17 00:00:00 2001 From: Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Date: Fri, 29 Nov 2024 14:46:55 +0800 Subject: [PATCH 03/10] CANN: RoPE operator optimization (#10563) * [cann] RoPE operator optimization * [CANN]Code Formatting --------- Co-authored-by: noemotiovon --- ggml/src/ggml-cann/aclnn_ops.cpp | 241 ++++++++++++++++++++++++++++--- ggml/src/ggml-cann/ggml-cann.cpp | 13 +- 2 files changed, 222 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index d707efc5d..b2d857e1e 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2965,7 +2965,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_cos_repeat_tensor, aclTensor* acl_sin_repeat_tensor, float theta_scale, float freq_scale, - bool is_neox) { + float attn_factor, bool is_neox) { // int sin/cos cache, cache has different repeat method depond on // @param.is_neox @@ -3017,6 +3017,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS); aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, nullptr, true); + ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor)); } // position @@ -3047,16 +3048,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor); - // // power[] * position[] * freq_scale / freq_factors[] - // ggml_cann_pool_alloc theta_final_allocator(ctx.pool(), - // theta_length * - // sizeof(float_t)); - // aclTensor* acl_theat_final_tensor = aclnn_zero( - // ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length, - // theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t)); - // aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor, - // acl_freq_factors_tensor, freq_scale); - // permute: [0,1,2,3]->[0,2,1,3] int64_t permute_ne[] = {arange_length, 1, position_length, 1}; size_t permute_nb[GGML_MAX_DIMS]; @@ -3092,6 +3083,12 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); + // attn_factor + if (attn_factor != 1) { + aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); + aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true); + } + // repeat if (is_neox) { int64_t repeatsArray[] = {1, 1, 1, 2}; @@ -3155,15 +3152,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); - // TODO: attn_factor != 1 - GGML_ASSERT(attn_factor == 1); // TODO: n_dims <= ne0 GGML_ASSERT(n_dims == ne0); GGML_ASSERT(n_dims % 2 == 0); // TODO: ext_factor != 0 GGML_ASSERT(ext_factor == 0); - // TODO: type == GGML_TYPE_F16 - GGML_ASSERT(src0->type == GGML_TYPE_F32); const float theta_scale = powf(freq_base, -2.0f / n_dims); @@ -3194,7 +3187,217 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, - theta_scale, freq_scale, is_neox); + theta_scale, freq_scale, attn_factor, is_neox); + + aclTensor* acl_src = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + +#ifdef ASCEND_310P + // Special ROPE operation for 310P + + // roll input + void* input_roll_buffer; + aclTensor* acl_minus_one_tensor; + void* minus_one_scale_buffer = nullptr; + ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); + ggml_cann_pool_alloc minus_one_scale_allocator( + ctx.pool(), sizeof(float_t) * src0->ne[0]); + if (!is_neox) { + // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] + input_roll_buffer = roll_allocator.get(); + int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), + src0->ne[2], src0->ne[3]}; + size_t input_roll_nb[GGML_MAX_DIMS]; + input_roll_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; + } + aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, input_roll_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + src0->data, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, input_roll_nb, + GGML_MAX_DIMS); + + int64_t shifts[] = {1}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, 1, -1, 1, ...] + minus_one_scale_buffer = minus_one_scale_allocator.get(); + + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = aclnn_values( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); + int64_t dim = 3; + int64_t* index = new int64_t[src0->ne[0]]; + for (int i = 0; i < src0->ne[0]; i++) { + index[i] = i / 2 * 2; + } + int64_t index_num = src0->ne[0]; + float value = -1; + aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, + index_num, value); + } else { + // roll input: [q0,q1,q2,...] -> + // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] + input_roll_buffer = roll_allocator.get(); + aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); + aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0); + + int64_t shifts[] = {src0->ne[0] / 2}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); + + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + // init [-1, -1, -1, 1, 1,1,...] + minus_one_scale_buffer = minus_one_scale_allocator.get(); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = aclnn_values( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); + // -1 * first half + int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; + size_t first_half_nb[GGML_MAX_DIMS]; + first_half_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; + } + aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( + minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, + first_half_nb, GGML_MAX_DIMS); + bool inplace = true; + float scale = -1; + aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); + ACL_CHECK(aclDestroyTensor(acl_first_half_tensor)); + } + + // TODO: n_dims < ne0 + GGML_ASSERT(n_dims == src0->ne[0]); + + // input * scale + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), + ggml_nbytes(src0)); + void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); + size_t input_nb[GGML_MAX_DIMS]; + input_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor( + input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + + aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, + acl_input_roll_mul_scale_tensor); + + // output + void* output_fp32_buffer; + if (src0->type == GGML_TYPE_F32) { + aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor); + aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor); + aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst); + // TODO: ne0 != n_dims in mode2 + } else if (src0->type == GGML_TYPE_F16) { + size_t input_fp32_nb[GGML_MAX_DIMS]; + input_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + } + ggml_cann_pool_alloc fp32_allocator1( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer1 = fp32_allocator1.get(); + aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( + input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + ggml_cann_pool_alloc fp32_allocator2( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer2 = fp32_allocator2.get(); + aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( + input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + + ggml_cann_pool_alloc fp32_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + output_fp32_buffer = fp32_allocator.get(); + aclTensor* output_fp32_tensor = ggml_cann_create_tensor( + output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, + input_fp32_tensor2); + aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + output_fp32_tensor); + aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); + + ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); + ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + } + return; +#endif + + // src0 == GGML_TYPE_F16 + // TODO: optimization this `if` code + if (src0->type == GGML_TYPE_F16) { + ggml_cann_pool_alloc sin_final_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); + ggml_cann_pool_alloc cos_final_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); + void* sin_final_buffer = sin_final_allocator.get(); + void* cos_final_buffer = cos_final_allocator.get(); + + int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; + size_t sin_final_nb[GGML_MAX_DIMS]; + sin_final_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1]; + } + aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor( + sin_final_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), sin_final_ne, sin_final_nb, + GGML_MAX_DIMS); + aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor( + cos_final_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), sin_final_ne, sin_final_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor, + ggml_cann_type_mapping(src0->type)); + aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor, + ggml_cann_type_mapping(src0->type)); + ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + acl_sin_reshape_tensor = acl_sin_final_tensor; + acl_cos_reshape_tensor = acl_cos_final_tensor; + } uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -3206,10 +3409,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_mode = 1; } - aclTensor* acl_x = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( - acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, + acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); @@ -3219,7 +3420,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize, executor, ctx.stream())); - ACL_CHECK(aclDestroyTensor(acl_x)); + ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_dst)); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index bcb54e444..04e25b8ab 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1739,7 +1739,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_ROPE: { // TODO: with ops-test v == 1 float * ext_factor = (float*)((int32_t*)op->op_params + 7); - float * attn_factor = (float*)((int32_t*)op->op_params + 8); // TODO: n_dims <= ne0 if (op->src[0]->ne[0] != op->op_params[1]) { return false; @@ -1748,17 +1747,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, if (*ext_factor != 0) { return false; } - // TODO: attn_factor != 1 - if (*attn_factor != 1) { - return false; - } - //TODO: type == GGML_TYPE_F16 - switch (op->src[0]->type) { - case GGML_TYPE_F32: - return true; - default: - return false; - } + return true; } case GGML_OP_UPSCALE: { // aclnnUpsampleNearest2dGetWorkspaceSize not support From 266b8519ee6d21e7ba2bf56f5629e20a181fee8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Fri, 29 Nov 2024 09:49:43 +0000 Subject: [PATCH 04/10] sycl : Reroute permuted mul_mats through oneMKL (#10408) This PR fixes the failing MUL_MAT tests for the sycl backend. --- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b6392ed8d..aabcdc224 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3447,8 +3447,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - // KQ single-batch - ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + // TODO: Refactor and cleanup of mul mat dispatching. + if (src0->ne[3] == 1 && src1->ne[3] == 1) { + // KQ single-batch + // mmv p021 was specific for these dimensions + ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + } else { + // The kernel from the if path is faster for that specific case, but does not support all mul mats. + ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + } } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst); From 0f77aae5608f16780a49926b67be6d56ec4b09bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Fri, 29 Nov 2024 12:38:45 +0000 Subject: [PATCH 05/10] sycl : offload of get_rows set to 0 (#10432) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index aabcdc224..808f74fa0 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4493,7 +4493,7 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_ static int64_t get_op_batch_size(const ggml_tensor * op) { switch (op->op) { case GGML_OP_GET_ROWS: - return op->ne[1]; // this will increse the speed of prefill in test + return 0; case GGML_OP_MUL_MAT: return op->ne[1]; case GGML_OP_MUL_MAT_ID: From 4b3242bbea172ac0980378496fbc676d44c4f459 Mon Sep 17 00:00:00 2001 From: Shupei Fan Date: Fri, 29 Nov 2024 21:49:02 +0800 Subject: [PATCH 06/10] ggml-cpu: fix typo in gemv/gemm iq4_nl_4_4 (#10580) --- ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 69d3d327d..14a1f00eb 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -1020,7 +1020,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void float * res_ptr = s; for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); float32x4_t sumf = vdupq_n_f32(0); for (int l = 0; l < nb; l++) { @@ -3507,7 +3507,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void for (int y = 0; y < nr / 4; y++) { const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); float32x4_t sumf[4]; for (int m = 0; m < 4; m++) { From f0678c5ff4cb8873d6ff48801475ff270db656fa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 29 Nov 2024 16:25:39 +0200 Subject: [PATCH 07/10] ggml : fix I8MM Q4_1 scaling factor conversion (#10562) ggml-ci --- ggml/src/ggml-cpu/ggml-cpu-quants.c | 57 +++++++++++++++++------------ ggml/src/ggml-cpu/ggml-cpu.c | 4 +- tests/test-backend-ops.cpp | 2 + 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 11e8df253..634c5fa11 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -1791,11 +1791,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_l = vld1q_s8(b_y1->qs); const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; - + float32_t _scale[4] = { + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + }; float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -1811,7 +1812,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); + l1, r1)), l2, r2)), l3, r3))), scale); } float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); @@ -2347,10 +2348,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const block_q8_1 * restrict b_y0 = &vy0[i]; const block_q8_1 * restrict b_y1 = &vy1[i]; - float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)}; + float32_t summs_t[4] = { + GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), + GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), + GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), + GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) + }; summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2371,10 +2374,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); // mmla into int32x4_t - float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d, - GGML_FP16_TO_FP32(b_x0->d)*b_y1->d, - GGML_FP16_TO_FP32(b_x1->d)*b_y0->d, - GGML_FP16_TO_FP32(b_x1->d)*b_y1->d}; + float32_t _scale[4] = { + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + }; float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -2389,15 +2394,17 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); + l1, r1)), l2, r2)), l3, r3))), scale); } - float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + sumv2 = vaddq_f32(sumv2, summs0); vst1_f32(s, vget_low_f32 (sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2)); + return; } #endif @@ -3374,10 +3381,12 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_l = vld1q_s8(b_y1->qs); const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + float32_t _scale[4] = { + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + }; float32x4_t scale = vld1q_f32(_scale); int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -3393,13 +3402,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); + l1, r1)), l2, r2)), l3, r3))), scale); } - float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s, vget_low_f32 (sumv2)); vst1_f32(s + bs, vget_high_f32(sumv2)); + return; } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1c88e5d81..e0cefc20b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -7641,8 +7641,8 @@ UseGgmlGemm2:; // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols int64_t num_rows_per_vec_dot = vec_dot_num_rows; - // TODO: currently the mmla kernels support only even numbered rows/cols. - // this check can be removed once they are extended to support odd numbered rows/cols too + // these checks are needed to avoid crossing dim1 boundaries + // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { num_rows_per_vec_dot = 1; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index da66ed856..4d9df1a64 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3334,7 +3334,9 @@ static const ggml_type all_types[] = { static const ggml_type base_types[] = { GGML_TYPE_F32, GGML_TYPE_F16, + GGML_TYPE_Q8_0, // for I8MM tests GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, // for I8MM tests GGML_TYPE_Q4_K, GGML_TYPE_IQ2_XXS }; From a3a3048e7a0f9464d0d625a29257d8bce5da5090 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 29 Nov 2024 17:45:08 +0100 Subject: [PATCH 08/10] cleanup UI link list (#10577) * cleanup UI link list * sort list alphabetically * add missing licenses --- README.md | 58 ++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 414c5b1c0..8aa3d1e59 100644 --- a/README.md +++ b/README.md @@ -140,44 +140,36 @@ Typically finetunes of the base models below are supported as well. **UI:** -Unless otherwise noted these projects are open-source with permissive licensing: - -- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) +- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) +- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) +- [Dot](https://github.com/alexpinel/Dot) (GPL) +- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) +- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0) - [janhq/jan](https://github.com/janhq/jan) (AGPL) -- [nat/openplayground](https://github.com/nat/openplayground) -- [Faraday](https://faraday.dev/) (proprietary) +- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) +- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) +- [LARS](https://github.com/abgulati/LARS) (AGPL) +- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) +- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) - [LMStudio](https://lmstudio.ai/) (proprietary) -- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) -- [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) -- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) -- [ollama/ollama](https://github.com/ollama/ollama) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) -- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) -- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) -- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) -- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) -- [RAGNA Desktop](https://ragna.app/) (proprietary) -- [RecurseChat](https://recurse.chat/) (proprietary) -- [semperai/amica](https://github.com/semperai/amica) -- [withcatai/catai](https://github.com/withcatai/catai) -- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) -- [Msty](https://msty.app) (proprietary) -- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) -- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) -- [Dot](https://github.com/alexpinel/Dot) (GPL) - [MindMac](https://mindmac.app) (proprietary) -- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) -- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) -- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) -- [AIKit](https://github.com/sozercan/aikit) (MIT) -- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) -- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) -- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) +- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0) +- [nat/openplayground](https://github.com/nat/openplayground) (MIT) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT) +- [ollama/ollama](https://github.com/ollama/ollama) (MIT) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) +- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT) +- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [ramalama](https://github.com/containers/ramalama) (MIT) +- [semperai/amica](https://github.com/semperai/amica) (MIT) +- [withcatai/catai](https://github.com/withcatai/catai) (MIT) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* From 3a8e9af402f7893423bdab444aa16c5d9a2d429a Mon Sep 17 00:00:00 2001 From: Robert Collins Date: Fri, 29 Nov 2024 12:21:37 -0500 Subject: [PATCH 09/10] imatrix : support combine-only (#10492) * imatrix-combine-only idea * ensured that behavior consistent with log --- examples/imatrix/imatrix.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 70ff47768..45206f4a7 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,10 +637,19 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - if (!compute_imatrix(ctx, params)) { - return 1; + if (params.prompt.empty()) { + if (params.in_files.empty()) { + LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); + return 1; + } + LOG_INF("No prompt provided; combining precomputed matrices only.\n"); + } else { + if (!compute_imatrix(ctx, params)) { + return 1; + } } + g_collector.save_imatrix(); LOG("\n"); From b782e5c7d453b3f1fa8dc6c34cde7e2fa946af93 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 29 Nov 2024 21:48:56 +0100 Subject: [PATCH 10/10] server : add more test cases (#10569) * server : add split model test * add test speculative * add invalid cases --- examples/server/tests/unit/test_basic.py | 14 +++ .../server/tests/unit/test_chat_completion.py | 19 ++++ examples/server/tests/unit/test_infill.py | 22 ++++ examples/server/tests/unit/test_rerank.py | 17 +++ .../server/tests/unit/test_speculative.py | 103 ++++++++++++++++++ examples/server/tests/utils.py | 12 +- 6 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 examples/server/tests/unit/test_speculative.py diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py index 84db5ca1c..d82d54a5a 100644 --- a/examples/server/tests/unit/test_basic.py +++ b/examples/server/tests/unit/test_basic.py @@ -32,3 +32,17 @@ def test_server_models(): assert res.status_code == 200 assert len(res.body["data"]) == 1 assert res.body["data"][0]["id"] == server.model_alias + +def test_load_split_model(): + global server + server.model_hf_repo = "ggml-org/models" + server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf" + server.model_alias = "tinyllama-split" + server.start() + res = server.make_request("POST", "/completion", data={ + "n_predict": 16, + "prompt": "Hello", + "temperature": 0.0, + }) + assert res.status_code == 200 + assert match_regex("(little|girl)+", res.body["content"]) diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index d7aeb288d..1048d6fca 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -127,3 +127,22 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int assert res.status_code != 200 assert "error" in res.body + +@pytest.mark.parametrize("messages", [ + None, + "string", + [123], + [{}], + [{"role": 123}], + [{"role": "system", "content": 123}], + # [{"content": "hello"}], # TODO: should not be a valid case + [{"role": "system", "content": "test"}, {}], +]) +def test_invalid_chat_completion_req(messages): + global server + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "messages": messages, + }) + assert res.status_code == 400 or res.status_code == 500 + assert "error" in res.body diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index 38ce6c429..6a6d40a1c 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -8,6 +8,7 @@ def create_server(): global server server = ServerPreset.tinyllama_infill() + def test_infill_without_input_extra(): global server server.start() @@ -19,6 +20,7 @@ def test_infill_without_input_extra(): assert res.status_code == 200 assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"]) + def test_infill_with_input_extra(): global server server.start() @@ -33,3 +35,23 @@ def test_infill_with_input_extra(): }) assert res.status_code == 200 assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"]) + + +@pytest.mark.parametrize("input_extra", [ + {}, + {"filename": "ok"}, + {"filename": 123}, + {"filename": 123, "text": "abc"}, + {"filename": 123, "text": 456}, +]) +def test_invalid_input_extra_req(input_extra): + global server + server.start() + res = server.make_request("POST", "/infill", data={ + "prompt": "Complete this", + "input_extra": [input_extra], + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_suffix": "}\n", + }) + assert res.status_code == 400 + assert "error" in res.body diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py index 3a49fd3ac..189bc4c96 100644 --- a/examples/server/tests/unit/test_rerank.py +++ b/examples/server/tests/unit/test_rerank.py @@ -36,3 +36,20 @@ def test_rerank(): assert most_relevant["relevance_score"] > least_relevant["relevance_score"] assert most_relevant["index"] == 2 assert least_relevant["index"] == 3 + + +@pytest.mark.parametrize("documents", [ + [], + None, + 123, + [1, 2, 3], +]) +def test_invalid_rerank_req(documents): + global server + server.start() + res = server.make_request("POST", "/rerank", data={ + "query": "Machine learning is", + "documents": documents, + }) + assert res.status_code == 400 + assert "error" in res.body diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py new file mode 100644 index 000000000..982d6abb4 --- /dev/null +++ b/examples/server/tests/unit/test_speculative.py @@ -0,0 +1,103 @@ +import pytest +from utils import * + +# We use a F16 MOE gguf as main model, and q4_0 as draft model + +server = ServerPreset.stories15m_moe() + +MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf" + +def create_server(): + global server + server = ServerPreset.stories15m_moe() + # download draft model file if needed + file_name = MODEL_DRAFT_FILE_URL.split('/').pop() + model_draft_file = f'../../../{file_name}' + if not os.path.exists(model_draft_file): + print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}") + with open(model_draft_file, 'wb') as f: + f.write(requests.get(MODEL_DRAFT_FILE_URL).content) + print(f"Done downloading draft model file") + # set default values + server.model_draft = model_draft_file + server.draft_min = 4 + server.draft_max = 8 + + +@pytest.fixture(scope="module", autouse=True) +def fixture_create_server(): + return create_server() + + +def test_with_and_without_draft(): + global server + server.model_draft = None # disable draft model + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }) + assert res.status_code == 200 + content_no_draft = res.body["content"] + server.stop() + + # create new server with draft model + create_server() + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }) + assert res.status_code == 200 + content_draft = res.body["content"] + + assert content_no_draft == content_draft + + +def test_different_draft_min_draft_max(): + global server + test_values = [ + (1, 2), + (1, 4), + (4, 8), + (4, 12), + (8, 16), + ] + last_content = None + for draft_min, draft_max in test_values: + server.stop() + server.draft_min = draft_min + server.draft_max = draft_max + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }) + assert res.status_code == 200 + if last_content is not None: + assert last_content == res.body["content"] + last_content = res.body["content"] + + +@pytest.mark.parametrize("n_slots,n_requests", [ + (1, 2), + (2, 2), +]) +def test_multi_requests_parallel(n_slots: int, n_requests: int): + global server + server.n_slots = n_slots + server.start() + tasks = [] + for _ in range(n_requests): + tasks.append((server.make_request, ("POST", "/completion", { + "prompt": "I believe the meaning of life is", + "temperature": 0.0, + "top_k": 1, + }))) + results = parallel_function_calls(tasks) + for res in results: + assert res.status_code == 200 + assert match_regex("(wise|kind|owl|answer)+", res.body["content"]) diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index a831f113f..e17a05ff6 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -46,6 +46,7 @@ class ServerProcess: model_alias: str | None = None model_url: str | None = None model_file: str | None = None + model_draft: str | None = None n_threads: int | None = None n_gpu_layer: int | None = None n_batch: int | None = None @@ -68,6 +69,8 @@ class ServerProcess: response_format: str | None = None lora_files: List[str] | None = None disable_ctx_shift: int | None = False + draft_min: int | None = None + draft_max: int | None = None # session variables process: subprocess.Popen | None = None @@ -102,6 +105,8 @@ class ServerProcess: server_args.extend(["--model", self.model_file]) if self.model_url: server_args.extend(["--model-url", self.model_url]) + if self.model_draft: + server_args.extend(["--model-draft", self.model_draft]) if self.model_hf_repo: server_args.extend(["--hf-repo", self.model_hf_repo]) if self.model_hf_file: @@ -147,6 +152,10 @@ class ServerProcess: server_args.extend(["--no-context-shift"]) if self.api_key: server_args.extend(["--api-key", self.api_key]) + if self.draft_max: + server_args.extend(["--draft-max", self.draft_max]) + if self.draft_min: + server_args.extend(["--draft-min", self.draft_min]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") @@ -185,7 +194,8 @@ class ServerProcess: raise TimeoutError(f"Server did not start within {timeout_seconds} seconds") def stop(self) -> None: - server_instances.remove(self) + if self in server_instances: + server_instances.remove(self) if self.process: print(f"Stopping server with pid={self.process.pid}") self.process.kill()