From 37b9f0d29d7301dd0ec5dfae8f357ccee96325d7 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Sat, 19 Apr 2025 09:15:45 +0200
Subject: [PATCH 1/5] clip : refactor, add `image_manipulation` and
 `llava_uhd` classes (#13011)

* clip : refactor, add `image_manipulation` and `llava_uhd`

* refactor llava-1.6 preprocessing

* simplify logic for llava-1.5

* missing include
---
 examples/llava/clip.cpp | 922 ++++++++++++++++++++--------------------
 1 file changed, 464 insertions(+), 458 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 49c90b750..759706156 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -27,6 +27,7 @@
 #include <sstream>
 #include <cinttypes>
 #include <limits>
+#include <array>
 
 struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
@@ -1680,45 +1681,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     return true;
 }
 
-// Linear interpolation between two points
-inline float clip_lerp(float s, float e, float t) {
-    return s + (e - s) * t;
-}
-// Bilinear resize function
-static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
-    dst.nx = target_width;
-    dst.ny = target_height;
-    dst.buf.resize(3 * target_width * target_height);
-
-    float x_ratio = static_cast<float>(src.nx - 1) / target_width;
-    float y_ratio = static_cast<float>(src.ny - 1) / target_height;
-
-    for (int y = 0; y < target_height; y++) {
-        for (int x = 0; x < target_width; x++) {
-            float px = x_ratio * x;
-            float py = y_ratio * y;
-            int x_floor = static_cast<int>(px);
-            int y_floor = static_cast<int>(py);
-            float x_lerp = px - x_floor;
-            float y_lerp = py - y_floor;
-
-            for (int c = 0; c < 3; c++) {
-                float top = clip_lerp(
-                    static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
-                    static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
-                    x_lerp
-                );
-                float bottom = clip_lerp(
-                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
-                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
-                    x_lerp
-                );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
-            }
-        }
-    }
-}
-
 // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
 static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
     dst.nx = src.nx;
@@ -1732,336 +1694,489 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32
     }
 }
 
-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
+// set of tools to manipulate images
+// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
+struct image_manipulation {
+    // Bilinear resize function
+    static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
+        dst.nx = target_width;
+        dst.ny = target_height;
+        dst.buf.resize(3 * target_width * target_height);
 
-static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
-    const int nx = img.nx;
-    const int ny = img.ny;
+        float x_ratio = static_cast<float>(src.nx - 1) / target_width;
+        float y_ratio = static_cast<float>(src.ny - 1) / target_height;
 
-    dst.nx = target_width;
-    dst.ny = target_height;
-    dst.buf.resize(3 * target_width * target_height);
+        for (int y = 0; y < target_height; y++) {
+            for (int x = 0; x < target_width; x++) {
+                float px = x_ratio * x;
+                float py = y_ratio * y;
+                int x_floor = static_cast<int>(px);
+                int y_floor = static_cast<int>(py);
+                float x_lerp = px - x_floor;
+                float y_lerp = py - y_floor;
 
-    float Cc;
-    float C[5];
-    float d0, d2, d3, a0, a1, a2, a3;
-    int i, j, k, jj;
-    int x, y;
-    float dx, dy;
-    float tx, ty;
-
-    tx = (float)nx / (float)target_width;
-    ty = (float)ny / (float)target_height;
-
-    // Bicubic interpolation; adapted from ViT.cpp, inspired from :
-    //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
-    //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation
-
-    for (i = 0; i < target_height; i++) {
-        for (j = 0; j < target_width; j++) {
-            x = (int)(tx * j);
-            y = (int)(ty * i);
-
-            dx = tx * j - x;
-            dy = ty * i - y;
-
-            for (k = 0; k < 3; k++) {
-                for (jj = 0; jj <= 3; jj++) {
-                    d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                    d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                    d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                    a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-
-                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
-                    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
-                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
-
-                    C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
-
-                    d0 = C[0] - C[1];
-                    d2 = C[2] - C[1];
-                    d3 = C[3] - C[1];
-                    a0 = C[1];
-                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
-                    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
-                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
-                    Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
-
-                    const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
-                    dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+                for (int c = 0; c < 3; c++) {
+                    float top = lerp(
+                        static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
+                        static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
+                        x_lerp
+                    );
+                    float bottom = lerp(
+                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
+                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
+                        x_lerp
+                    );
+                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
                 }
             }
         }
     }
 
-    return true;
-}
+    // Bicubic resize function
+    // part of image will be cropped if the aspect ratio is different
+    static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+        const int nx = img.nx;
+        const int ny = img.ny;
 
-// llava-1.6 type of resize_and_pad (black)
-static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
-    int target_width = target_resolution.first;
-    int target_height = target_resolution.second;
+        dst.nx = target_width;
+        dst.ny = target_height;
+        dst.buf.resize(3 * target_width * target_height);
 
-    float scale_w = static_cast<float>(target_width) / image.nx;
-    float scale_h = static_cast<float>(target_height) / image.ny;
+        float Cc;
+        float C[5];
+        float d0, d2, d3, a0, a1, a2, a3;
+        int i, j, k, jj;
+        int x, y;
+        float dx, dy;
+        float tx, ty;
 
-    int new_width, new_height;
+        tx = (float)nx / (float)target_width;
+        ty = (float)ny / (float)target_height;
 
-    if (scale_w < scale_h) {
-        new_width = target_width;
-        new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
-    } else {
-        new_height = target_height;
-        new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
+        // Bicubic interpolation; adapted from ViT.cpp, inspired from :
+        //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+        //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+        for (i = 0; i < target_height; i++) {
+            for (j = 0; j < target_width; j++) {
+                x = (int)(tx * j);
+                y = (int)(ty * i);
+
+                dx = tx * j - x;
+                dy = ty * i - y;
+
+                for (k = 0; k < 3; k++) {
+                    for (jj = 0; jj <= 3; jj++) {
+                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+
+                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                        a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+                        a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+
+                        C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
+
+                        d0 = C[0] - C[1];
+                        d2 = C[2] - C[1];
+                        d3 = C[3] - C[1];
+                        a0 = C[1];
+                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                        a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+                        a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+                        Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
+
+                        const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+                    }
+                }
+            }
+        }
+
+        return true;
     }
 
-    clip_image_u8 resized_image;
-    // bilinear_resize(image, resized_image, new_width, new_height);
-    bicubic_resize(image, resized_image, new_width, new_height);
+    // llava-1.6 type of resize_and_pad
+    // if the ratio is not 1:1, padding with pad_color will be applied
+    // pad_color is single channel, default is 0 (black)
+    static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
+        int target_width  = target_resolution.width;
+        int target_height = target_resolution.height;
 
-    clip_image_u8 padded_image;
-    padded_image.nx = target_width;
-    padded_image.ny = target_height;
-    padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
+        float scale_w = static_cast<float>(target_width) / image.nx;
+        float scale_h = static_cast<float>(target_height) / image.ny;
 
-    // Calculate padding offsets
-    int pad_x = (target_width - new_width) / 2;
-    int pad_y = (target_height - new_height) / 2;
+        int new_width, new_height;
 
-    // Copy the resized image into the center of the padded buffer
-    for (int y = 0; y < new_height; ++y) {
-        for (int x = 0; x < new_width; ++x) {
-            for (int c = 0; c < 3; ++c) {
-                padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
+        if (scale_w < scale_h) {
+            new_width  = target_width;
+            new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
+        } else {
+            new_height = target_height;
+            new_width  = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
+        }
+
+        clip_image_u8 resized_image;
+        bicubic_resize(image, resized_image, new_width, new_height);
+
+        clip_image_u8 padded_image;
+        padded_image.nx = target_width;
+        padded_image.ny = target_height;
+        padded_image.buf.resize(3 * target_width * target_height);
+
+        // Fill the padded image with the fill color
+        for (size_t i = 0; i < padded_image.buf.size(); i += 3) {
+            padded_image.buf[i]     = pad_color[0];
+            padded_image.buf[i + 1] = pad_color[1];
+            padded_image.buf[i + 2] = pad_color[2];
+        }
+
+        // Calculate padding offsets
+        int pad_x = (target_width  - new_width)  / 2;
+        int pad_y = (target_height - new_height) / 2;
+
+        // Copy the resized image into the center of the padded buffer
+        for (int y = 0; y < new_height; ++y) {
+            for (int x = 0; x < new_width; ++x) {
+                for (int c = 0; c < 3; ++c) {
+                    padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
+                }
             }
         }
+        dst = std::move(padded_image);
     }
-    image_output = std::move(padded_image);
-}
+
+    static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
+        dst.nx = w;
+        dst.ny = h;
+        dst.buf.resize(3 * w * h);
+
+        for (int i = 0; i < h; ++i) {
+            for (int j = 0; j < w; ++j) {
+                int src_idx = 3 * ((y + i)*image.nx + (x + j));
+                int dst_idx = 3 * (i*w + j);
+                dst.buf[dst_idx]     = image.buf[src_idx];
+                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
+                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+            }
+        }
+    }
+
+private:
+    static inline int clip(int x, int lower, int upper) {
+        return std::max(lower, std::min(x, upper));
+    }
+
+    // Linear interpolation between two points
+    static inline float lerp(float s, float e, float t) {
+        return s + (e - s) * t;
+    }
+};
 
 /**
- * Selects the best resolution from a list of possible resolutions based on the original size.
+ * implementation of LLaVA-UHD:
+ *  - https://arxiv.org/pdf/2403.11703
+ *  - https://github.com/thunlp/LLaVA-UHD
+ *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
 *
- * @param original_size The original size of the image in the format (width, height).
- * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
- * @return The best fit resolution in the format (width, height).
+ * overview:
+ *   - an image always has a single overview (downscaled image)
+ *   - an image can have 0 or multiple slices, depending on the image size
+ *   - each slice can then be considered as a separate image
+ *
+ * for example:
+ *
+ * [overview] --> [slice 1] --> [slice 2]
+ *           |                |
+ *           +--> [slice 3] --> [slice 4]
 */
-static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
-    int original_width = original_size.first;
-    int original_height = original_size.second;
-    std::pair<int, int> best_fit;
-    int max_effective_resolution = 0;
-    int min_wasted_resolution = std::numeric_limits<int>::max();
+struct llava_uhd {
+    struct slice_coordinates {
+        int x;
+        int y;
+        clip_image_size size;
+    };
 
-    for (const auto& resolution : possible_resolutions) {
-        int width = resolution.first;
-        int height = resolution.second;
-        float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
-        int downscaled_width = static_cast<int>(original_width * scale);
-        int downscaled_height = static_cast<int>(original_height * scale);
-        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
-        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
-        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
-            max_effective_resolution = effective_resolution;
-            min_wasted_resolution = wasted_resolution;
-            best_fit = resolution;
+    struct slice_instructions {
+        clip_image_size overview_size; // size of downscaled image
+        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
+        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
+        std::vector<slice_coordinates> slices;
+        bool padding_refined = false;  // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
+    };
+
+    static int get_max_slices(struct clip_ctx * ctx) {
+        if (clip_is_minicpmv(ctx)) {
+            return 9;
         }
+        return 0;
     }
 
-    return best_fit;
-}
+    static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
+        slice_instructions res;
+        const int patch_size      = clip_get_patch_size(ctx);
+        const int slice_size      = clip_get_image_size(ctx);
+        const int max_slice_nums  = get_max_slices(ctx);
+        const int original_width  = original_size.width;
+        const int original_height = original_size.height;
+        const float log_ratio = log((float)original_width / original_height);
+        const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+        const int multiple = fmin(ceil(ratio), max_slice_nums);
+        const bool has_slices    = (multiple > 1);
+        const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty();
 
-static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
-    std::vector<clip_image_u8_ptr> patches;
-    int width = image.nx;
-    int height = image.ny;
-    for (int i = 0; i < height; i += patch_size) {
-        for (int j = 0; j < width; j += patch_size) {
-            clip_image_u8_ptr patch(clip_image_u8_init());
-            patch->nx = std::min(patch_size, width - j);
-            patch->ny = std::min(patch_size, height - i);
-            patch->buf.resize(3 * patch->nx * patch->ny);
-            for (int y = 0; y < patch->ny; ++y) {
-                for (int x = 0; x < patch->nx; ++x) {
-                    for (int c = 0; c < 3; ++c) {
-                        patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c];
+        if (has_pinpoints) {
+            // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
+            auto refine_size = llava_uhd::select_best_resolution(
+                ctx->vision_model.hparams.image_grid_pinpoints,
+                original_size);
+            res.overview_size   = clip_image_size{slice_size, slice_size};
+            res.refined_size    = refine_size;
+            res.grid_size       = clip_image_size{0, 0};
+            res.padding_refined = true;
+
+            for (int y = 0; y < refine_size.height; y += slice_size) {
+                for (int x = 0; x < refine_size.width; x += slice_size) {
+                    slice_coordinates slice;
+                    slice.x = x;
+                    slice.y = y;
+                    slice.size.width  = std::min(slice_size, refine_size.width  - x);
+                    slice.size.height = std::min(slice_size, refine_size.height - y);
+                    res.slices.push_back(slice);
+                    if (x == 0) {
+                        res.grid_size.width++;
                     }
                 }
+                res.grid_size.height++;
             }
-            patches.push_back(std::move(patch));
+
+            return res;
         }
-    }
-    return patches;
-}
 
-static int ensure_divide(int length, int patch_size) {
-    return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
-}
+        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
 
-static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
-    int width = original_size.first;
-    int height = original_size.second;
-    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
-        float r = static_cast<float>(width) / height;
-        height = static_cast<int>(scale_resolution / std::sqrt(r));
-        width = static_cast<int>(height * r);
-    }
-    int best_width = ensure_divide(width, patch_size);
-    int best_height = ensure_divide(height, patch_size);
-    return std::make_pair(best_width, best_height);
-}
+        auto best_size    = get_best_resize(original_size, slice_size, patch_size, has_slices);
+        res.overview_size = best_size;
 
-static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
-    int width, height;
-    std::tie(width, height) = original_size;
-    int grid_x, grid_y;
-    std::tie(grid_x, grid_y) = grid;
+        if (!has_slices) {
+            // skip slicing logic
+            res.refined_size = clip_image_size{0, 0};
+            res.grid_size    = clip_image_size{0, 0};
 
-    int refine_width = ensure_divide(width, grid_x);
-    int refine_height = ensure_divide(height, grid_y);
+        } else {
+            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+            res.grid_size    = best_grid;
+            res.refined_size = refine_size;
 
-    int grid_width = refine_width / grid_x;
-    int grid_height = refine_height / grid_y;
-
-    // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
-    auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
-    int best_grid_width, best_grid_height;
-    std::tie(best_grid_width, best_grid_height) = best_grid_size;
-
-    // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
-    std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
-    return refine_size;
-}
-
-static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
-    std::vector<int> candidate_split_grids_nums;
-    for (int i : {multiple - 1, multiple, multiple + 1}) {
-        if (i == 1 || i > max_slice_nums) {
-            continue;
-        }
-        candidate_split_grids_nums.push_back(i);
-    }
-
-    std::vector<std::pair<int, int>> candidate_grids;
-    for (int split_grids_nums : candidate_split_grids_nums) {
-        int m = 1;
-        while (m <= split_grids_nums) {
-            if (split_grids_nums % m == 0) {
-                candidate_grids.emplace_back(m, split_grids_nums / m);
-            }
-            ++m;
-        }
-    }
-
-    std::pair<int, int> best_grid{1, 1};
-    float min_error = std::numeric_limits<float>::infinity();
-    for (const auto& grid : candidate_grids) {
-        float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
-        if (error < min_error) {
-            best_grid = grid;
-            min_error = error;
-        }
-    }
-    return best_grid;
-}
-
-// inspired from LLaVA-UHD:
-//    -> https://arxiv.org/pdf/2403.11703
-//    -> https://github.com/thunlp/LLaVA-UHD
-//    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
-static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
-    const std::pair<int, int> original_size={img->nx,img->ny};
-    const int original_width = img->nx;
-    const int original_height = img->ny;
-    const float log_ratio = log(1.0*original_width/original_height);
-    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
-    const int multiple = fmin(ceil(ratio), max_slice_nums);
-
-    std::vector<std::vector<clip_image_u8_ptr>> images;
-    LOG_DBG("%s: multiple %d\n", __func__, multiple);
-    images.push_back(std::vector<clip_image_u8_ptr>());
-
-    if (multiple <= 1) {
-        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
-        clip_image_u8_ptr source_image(clip_image_u8_init());
-        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
-        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
-        images.back().push_back(std::move(source_image));
-    }
-    else if (multiple > 1) {
-        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
-        clip_image_u8_ptr source_image(clip_image_u8_init());
-        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
-        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
-        images.back().push_back(std::move(source_image));
-
-        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
-
-        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
-        clip_image_u8_ptr refine_image(clip_image_u8_init());
-        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
-
-        LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
-
-        // split_to_patches
-        int width = refine_image->nx;
-        int height = refine_image->ny;
-        int grid_x = int(width / best_grid.first);
-        int grid_y = int(height / best_grid.second);
-        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
-            images.push_back(std::vector<clip_image_u8_ptr>());
-            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
-                clip_image_u8_ptr patch(clip_image_u8_init());
-                patch->nx = grid_x;
-                patch->ny = grid_y;
-                patch->buf.resize(3 * patch->nx * patch->ny);
-                for (int y = patches_i; y < patches_i + grid_y; ++y) {
-                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
-                        const int i = 3 * (y * refine_image->nx + x);
-                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
-                        patch->buf[j]   = refine_image->buf[i];
-                        patch->buf[j+1] = refine_image->buf[i+1];
-                        patch->buf[j+2] = refine_image->buf[i+2];
-                    }
+            int width  = refine_size.width;
+            int height = refine_size.height;
+            int grid_x = int(width  / best_grid.width);
+            int grid_y = int(height / best_grid.height);
+            for (int patches_y = 0, ic = 0;
+                    patches_y < refine_size.height && ic < best_grid.height;
+                    patches_y += grid_y, ic += 1) {
+                for (int patches_x = 0, jc = 0;
+                        patches_x < refine_size.width && jc < best_grid.width;
+                        patches_x += grid_x, jc += 1) {
+                    slice_coordinates slice;
+                    slice.x = patches_x;
+                    slice.y = patches_y;
+                    slice.size.width  = grid_x;
+                    slice.size.height = grid_y;
+                    res.slices.push_back(slice);
+                    // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
                 }
-                images.back().push_back(std::move(patch));
             }
         }
-    }
-    return images;
-}
+
+        return res;
+    }
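+
+    // example (no pinpoints, minicpmv-style): a 1024x768 input with slice_size = 448
+    // gives ratio ~= 3.9, so multiple = 4 and get_best_grid(9, 4, log(4/3)) selects a
+    // 2x2 grid; the returned instructions then describe 1 overview image plus 4 slices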
+
+    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
+        std::vector<clip_image_u8_ptr> output;
+
+        // resize to overview size
+        clip_image_u8_ptr resized_img(clip_image_u8_init());
+        image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
+        output.push_back(std::move(resized_img));
+        if (inst.slices.empty()) {
+            // no slices, just return the resized image
+            return output;
+        }
+
+        // resize to refined size
+        clip_image_u8_ptr refined_img(clip_image_u8_init());
+        if (inst.padding_refined) {
+            image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size);
+        } else {
+            image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height);
+        }
+
+        // create slices
+        for (const auto & slice : inst.slices) {
+            int x = slice.x;
+            int y = slice.y;
+            int w = slice.size.width;
+            int h = slice.size.height;
+
+            clip_image_u8_ptr img_slice(clip_image_u8_init());
+            image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h);
+            output.push_back(std::move(img_slice));
+        }
+
+        return output;
+    }
+
+private:
+    static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+        int width  = original_size.width;
+        int height = original_size.height;
+        if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+            float r = static_cast<float>(width) / height;
+            height  = static_cast<int>(scale_resolution / std::sqrt(r));
+            width   = static_cast<int>(height * r);
+        }
+        clip_image_size res;
+        res.width  = ensure_divide(width,  patch_size);
+        res.height = ensure_divide(height, patch_size);
+        return res;
+    }
+
+    /**
+     * Selects the best resolution from a list of possible resolutions based on the original size.
+     *
+     * @param original_size The original size of the image
+     * @param possible_resolutions A list of possible resolutions
+     * @return The best fit resolution
+     */
+    static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
+        int original_width  = original_size.width;
+        int original_height = original_size.height;
+        clip_image_size best_fit;
+        int max_effective_resolution = 0;
+        int min_wasted_resolution = std::numeric_limits<int>::max();
+
+        for (const auto & resolution : possible_resolutions) {
+            int width  = resolution.width;
+            int height = resolution.height;
+            float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+            int downscaled_width  = static_cast<int>(original_width * scale);
+            int downscaled_height = static_cast<int>(original_height * scale);
+            int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+            int wasted_resolution = (width * height) - effective_resolution;
+            // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+                max_effective_resolution = effective_resolution;
+                min_wasted_resolution = wasted_resolution;
+                best_fit = resolution;
+            }
+        }
+
+        return best_fit;
+    }
+
+    // used by llava 1.6 with custom list of pinpoints
+    static clip_image_size select_best_resolution(const std::vector<int> & pinpoints, const clip_image_size & original_size) {
+        std::vector<clip_image_size> possible_resolutions;
+        for (size_t i = 0; i < pinpoints.size(); i += 2) {
+            possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
+        }
+        return select_best_resolution(original_size, possible_resolutions);
+    }
+
+    static int ensure_divide(int length, int patch_size) {
+        return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+    }
+
+    static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+        int width  = original_size.width;
+        int height = original_size.height;
+        int grid_x = grid.width;
+        int grid_y = grid.height;
+
+        int refine_width  = ensure_divide(width,  grid_x);
+        int refine_height = ensure_divide(height, grid_y);
+
+        clip_image_size grid_size;
+        grid_size.width  = refine_width  / grid_x;
+        grid_size.height = refine_height / grid_y;
+
+        auto best_grid_size  = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
+        int best_grid_width  = best_grid_size.width;
+        int best_grid_height = best_grid_size.height;
+
+        clip_image_size refine_size;
+        refine_size.width  = best_grid_width  * grid_x;
+        refine_size.height = best_grid_height * grid_y;
+        return refine_size;
+    }
+
+    static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+        std::vector<int> candidate_split_grids_nums;
+        for (int i : {multiple - 1, multiple, multiple + 1}) {
+            if (i == 1 || i > max_slice_nums) {
+                continue;
+            }
+            candidate_split_grids_nums.push_back(i);
+        }
+
+        std::vector<clip_image_size> candidate_grids;
+        for (int split_grids_nums : candidate_split_grids_nums) {
+            int m = 1;
+            while (m <= split_grids_nums) {
+                if (split_grids_nums % m == 0) {
+                    candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
+                }
+                ++m;
+            }
+        }
+
+        clip_image_size best_grid{1, 1};
+        float min_error = std::numeric_limits<float>::infinity();
+        for (const auto& grid : candidate_grids) {
+            float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
+            if (error < min_error) {
+                best_grid = grid;
+                min_error = error;
+            }
+        }
+        return best_grid;
+    }
+};
 
+// TODO @ngxson : deprecate the load_image_size singleton pattern
 int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
-    const int max_slice_nums=9;
-    const int scale_resolution=448;
-    const int original_width = ctx_clip->load_image_size.width;
-    const int original_height = ctx_clip->load_image_size.height;
-    const float log_ratio = log(1.0*original_width/original_height);
-    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
-    const int multiple = fmin(ceil(ratio), max_slice_nums);
-    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-    return best_grid.first;
+    const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
+    return inst.grid_size.width;
 }
 
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
+    if (!ctx->has_vision_encoder) {
+        LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
+        return false;
+    }
+
+    clip_image_size original_size{img->nx, img->ny};
+    bool pad_to_square = true;
+    auto & params = ctx->vision_model.hparams;
+    // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
+    if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
+        pad_to_square = false;
+    }
+
     if (clip_is_minicpmv(ctx)) {
-        int max_slice_nums = 9;
-        std::vector<std::vector<clip_image_u8_ptr>> imgs = uhd_slice_image(img, max_slice_nums);
+        auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+        std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
         for (size_t i = 0; i < imgs.size(); ++i) {
-            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std);
-                res_imgs->entries.push_back(std::move(res));
-            }
+            // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
+            res_imgs->entries.push_back(std::move(res));
         }
         return true;
     }
@@ -2070,7 +2185,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         auto patch_size = clip_get_patch_size(ctx) * 2;
         int nx = ceil((float)img->nx / patch_size) * patch_size;
         int ny = ceil((float)img->ny / patch_size) * patch_size;
-        bicubic_resize(*img, resized, nx, ny);
+        image_manipulation::bicubic_resize(*img, resized, nx, ny);
 
         clip_image_f32_ptr img_f32(clip_image_f32_init());
         // clip_image_f32_ptr res(clip_image_f32_init());
@@ -2082,8 +2197,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
     if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
         clip_image_u8 resized_image;
-        int32_t sz=ctx->vision_model.hparams.image_size;
-        bicubic_resize(*img, resized_image,sz,sz);
+        int sz = params.image_size;
+        image_manipulation::bicubic_resize(*img, resized_image, sz, sz);
         clip_image_f32_ptr img_f32(clip_image_f32_init());
         //clip_image_save_to_bmp(resized_image, "resized.bmp");
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
@@ -2091,156 +2206,47 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         return true;
     }
 
-    bool pad_to_square = true;
-    if (!ctx->has_vision_encoder) {
-        LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
-        return false;
-    }
-    auto & params = ctx->vision_model.hparams;
-    // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
-    if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) {
-        pad_to_square = false;
-    }
-
     // free the previous res_imgs if any set
     res_imgs->entries.clear();
 
-    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
-    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
     clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
-    if (pad_to_square && img->nx != img->ny) {
-        int longer_side = std::max(img->nx, img->ny);
+
+    if (pad_to_square) {
+        // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
+        // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+        const int longer_side = std::max(img->nx, img->ny);
         temp->nx = longer_side;
         temp->ny = longer_side;
         temp->buf.resize(3 * longer_side * longer_side);
 
-        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
-        // fill with background color
-        for (size_t i = 0; i < temp->buf.size(); i++) {
-            temp->buf[i] = bc[i % 3];
+        // background color in RGB from LLaVA (this is the mean rgb color * 255)
+        const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+        // resize the image to the target_size
+        image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color);
+
+        clip_image_f32_ptr res(clip_image_f32_init());
+        normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
+        res_imgs->entries.push_back(std::move(res));
+        return true;
+
+    } else if (!params.image_grid_pinpoints.empty()) {
+        // "spatial_unpad" with "anyres" processing for llava-1.6
+        auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+        std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
+            res_imgs->entries.push_back(std::move(res));
        }
-        // copy from the input image
-        for (int y = 0; y < img->ny; y++) {
-            for (int x = 0; x < img->nx; x++) {
-                const int i = 3 * (y * img->nx + x);
-                const int j = 3 * (y * temp->nx + x);
-                temp->buf[j]   = img->buf[i];
-                temp->buf[j+1] = img->buf[i+1];
-                temp->buf[j+2] = img->buf[i+2];
-            }
-        }
-    } else {
-        if (!params.image_grid_pinpoints.empty()) {
-            // "spatial_unpad" with "anyres" processing for llava-1.6
-            std::vector<std::pair<int, int>> possible_resolutions;
-            for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
-                possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
-            }
-            std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
-            // clip_image_save_to_bmp(*img, "input.bmp");
-            resize_and_pad_image(*img, *temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6
-            // clip_image_save_to_bmp(*temp, "resized.bmp");
-            // visually verify normalized image:
-            // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std);
-            // {
-            //     clip_image_u8 * temp2 = clip_image_u8_init();
-            //     clip_image_convert_f32_to_u8(*res, *temp2);
-            //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
-            //     clip_image_u8_free(temp2);
-            // }
 
-            std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
-
-            clip_image_u8_ptr image_original_resize(clip_image_u8_init());
-            // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
-            bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
-            patches.insert(patches.begin(), std::move(image_original_resize));
-            for (auto & patch : patches) {
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std);
-                res_imgs->entries.push_back(std::move(res));
-            }
-
-            return true;
-        } else {
-            temp->nx = img->nx;
-            temp->ny = img->ny;
-            temp->buf.resize(img->buf.size());
-            memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
-        }
+        return true;
     }
 
-    const int nx = temp->nx;
-    const int ny = temp->ny;
-    // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
-
-    const int nx2 = ctx->vision_model.hparams.image_size;
-    const int ny2 = ctx->vision_model.hparams.image_size;
-    clip_image_f32_ptr res(clip_image_f32_init());
-    res->nx = nx2;
-    res->ny = ny2;
-    res->buf.resize(3 * nx2 * ny2);
-
-    const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
-
-    const int nx3 = int(nx / scale + 0.5f);
-    const int ny3 = int(ny / scale + 0.5f);
-
-    const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
-    const auto & s3 = ctx->image_std;  // {0.26862954f, 0.26130258f, 0.27577711f};
-
-    for (int y = 0; y < ny3; y++) {
-        for (int x = 0; x < nx3; x++) {
-            for (int c = 0; c < 3; c++) {
-                // linear interpolation
-                const float sx = (x + 0.5f) * scale - 0.5f;
-                const float sy = (y + 0.5f) * scale - 0.5f;
-
-                const int x0 = std::max(0, (int)std::floor(sx));
-                const int y0 = std::max(0, (int)std::floor(sy));
-
-                const int x1 = std::min(x0 + 1, nx - 1);
-                const int y1 = std::min(y0 + 1, ny - 1);
-
-                const float dx = sx - x0;
-                const float dy = sy - y0;
-
-                const int j00 = 3 * (y0 * nx + x0) + c;
-                const int j01 = 3 * (y0 * nx + x1) + c;
-                const int j10 = 3 * (y1 * nx + x0) + c;
-                const int j11 = 3 * (y1 * nx + x1) + c;
-
-                const float v00 = temp->buf[j00];
-                const float v01 = temp->buf[j01];
-                const float v10 = temp->buf[j10];
-                const float v11 = temp->buf[j11];
-
-                const float v0 = v00 * (1.0f - dx) + v01 * dx;
-                const float v1 = v10 * (1.0f - dx) + v11 * dx;
-
-                const float v = v0 * (1.0f - dy) + v1 * dy;
-
-                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
-
-                const int i = 3 * (y * nx3 + x) + c;
-
-                res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
-            }
-        }
-    }
-
-    // {
-    //     clip_image_u8 * temp2 = clip_image_u8_init();
-    //     clip_image_convert_f32_to_u8(*res, *temp2);
-    //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
-    //     clip_image_u8_free(temp2);
-    // }
-    // res_imgs.push_back(res);
-
-    res_imgs->entries.push_back(std::move(res));
-
-    return true;
+    GGML_ASSERT(false && "Unknown image preprocessing type");
 }
 
 ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {

From fb28f4f80efb5a2990a6a240433b02e259b0dbb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 19 Apr 2025 16:26:38 +0200
Subject: [PATCH 2/5] gguf-py : fix upload python package workflow (#13020)

---
 gguf-py/pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 9745d3ea8..0c8272567 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.16.1"
+version = "0.16.2"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -23,7 +23,7 @@ numpy = ">=1.17"
 tqdm = ">=4.27"
 pyyaml = ">=5.1"
 sentencepiece = ">=0.1.98,<=0.2.0"
-PySide6 = { version = "^6.9", optional = true }
+PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
 
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"

From 00137157fca3d17b90380762b4d7cc158d385bd3 Mon Sep 17 00:00:00 2001
From: bandoti <141645996+bandoti@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:05:03 -0300
Subject: [PATCH 3/5] Disable CI cross-compile builds (#13022)

---
 .github/workflows/build.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 34417985d..32c8b7717 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -601,8 +601,9 @@ jobs:
             -DGGML_SYCL_F16=ON
           cmake --build build --config Release -j $(nproc)
 
-  build-linux-cross:
-    uses: ./.github/workflows/build-linux-cross.yml
+# Disabled for now due to sporadic issue syncing.
+# build-linux-cross:
+#   uses: ./.github/workflows/build-linux-cross.yml
 
   macOS-latest-cmake-ios:
     runs-on: macos-latest

From 4ba9d711ba0a5bf0be00936d3258d4a5e4164d4f Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Sat, 19 Apr 2025 22:28:40 -0700
Subject: [PATCH 4/5] metal: add neg operator (#13029)

---
 ggml/src/ggml-metal/ggml-metal.m     | 15 +++++++++++++++
 ggml/src/ggml-metal/ggml-metal.metal |  7 +++++++
 2 files changed, 22 insertions(+)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 85f3ae7bf..266d8af46 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -481,6 +481,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_SQRT,
     GGML_METAL_KERNEL_TYPE_SIN,
     GGML_METAL_KERNEL_TYPE_COS,
+    GGML_METAL_KERNEL_TYPE_NEG,
     GGML_METAL_KERNEL_TYPE_SUM_ROWS,
     GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
     GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1159,6 +1160,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1320,6 +1322,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_ELU:
+            case GGML_UNARY_OP_NEG:
                 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             default:
                 return false;
@@ -2010,6 +2013,18 @@ static void ggml_metal_encode_node(
 
                         [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                     } break;
+                case GGML_UNARY_OP_NEG:
+                    {
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
                 default:
                     {
                         GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index dc7eab03e..8d6e99e62 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -949,6 +949,13 @@ kernel void kernel_cos(
     dst[tpig] = cos(src0[tpig]);
 }
 
+kernel void kernel_neg(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = -src0[tpig];
+}
+
 kernel void kernel_sum_rows(
         device const float * src0,
         device       float * dst,

From 66168204be9559dc841f06f0025f3da01d9a8546 Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sun, 20 Apr 2025 03:50:02 -0500
Subject: [PATCH 5/5] vulkan: support noncontiguous rms_norm (#13031)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 22 ++++++++++---
 .../ggml-vulkan/vulkan-shaders/rms_norm.comp  | 32 ++++++++++++-------
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 2f6d03c93..39f3cd343 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2397,7 +2397,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
@@ -6006,6 +6006,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_ROPE:
+    case GGML_OP_RMS_NORM:
         return true;
     default:
        return false;
@@ -6216,7 +6217,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
     switch (op) {
     case GGML_OP_NORM:
-    case GGML_OP_RMS_NORM:
    case GGML_OP_RMS_NORM_BACK:
    case GGML_OP_L2_NORM:
    case GGML_OP_SOFT_MAX:
@@ -6233,6 +6233,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             elements = { nr, 1, 1 };
         }
     } break;
+    case GGML_OP_RMS_NORM:
+        elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 };
+        break;
+
    case GGML_OP_SUM:
        // We use GGML_OP_SUM_ROWS with 1 row.
         elements = { 1, 1, 1 };
@@ -6883,7 +6887,17 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx
 
 static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
-    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size  = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
+        0,
+        op_params[0], 0.0f,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    }, dryrun);
 }
 
 static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9388,10 +9402,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+        case GGML_OP_RMS_NORM:
             return true;
         case GGML_OP_NORM:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_RMS_NORM:
         case GGML_OP_L2_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_ADD:
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
index b554400ba..deb8ee996 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
@@ -1,6 +1,6 @@
 #version 450
 
-#include "generic_head.comp"
+#include "generic_unary_head.comp"
 #include "types.comp"
 
 #extension GL_EXT_control_flow_attributes : enable
@@ -8,19 +8,29 @@
 
 layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
 
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
 shared FLOAT_TYPE sum[BLOCK_SIZE];
 
 void main() {
-    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
-    const uint tid = gl_LocalInvocationID.x;
+    const uint ncols     = p.ne00;
+    const uint nrows     = gl_NumWorkGroups.x;
+    const uint nchannels = gl_NumWorkGroups.y;
+
+    const uint row     = gl_WorkGroupID.x;
+    const uint channel = gl_WorkGroupID.y;
+    const uint samp    = gl_WorkGroupID.z;
+    const uint tid = gl_LocalInvocationID.x;
+
+    const uint stride_row     = p.nb01;
+    const uint stride_channel = p.nb02;
+    const uint stride_sample  = p.nb03;
+
+    uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset();
+    uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset();
 
     sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
 
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]);
+    [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
+        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_offset + col]);
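+        // each thread accumulates squared elements of its (possibly strided) row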
         sum[tid] += xi * xi;
     }
 
@@ -33,10 +43,10 @@ void main() {
         barrier();
     }
 
-    const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX);
+    const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols);
     const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
 
-    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
-        data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col]));
+    [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) {
+        data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]));
     }
 }
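
Note on the rms_norm.comp change in PATCH 5/5: each workgroup now maps to one (row, channel, sample) triple and reads through element-unit strides instead of assuming dense rows. A minimal CPU-side sketch of the same offset arithmetic, assuming the stride fields of vk_op_unary_push_constants (the get_aoffset()/get_doffset() view offsets are omitted, and the harness itself is illustrative, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the shader's a_offset: strides are nb01/nb02/nb03 divided by the
    // element size on the host side, so they count elements, not bytes.
    static uint32_t a_offset(uint32_t row, uint32_t channel, uint32_t samp,
                             uint32_t stride_row, uint32_t stride_channel, uint32_t stride_sample) {
        return samp*stride_sample + channel*stride_channel + row*stride_row;
    }

    // Mirrors the shader's d_offset: the destination is written densely, row after row.
    static uint32_t d_offset(uint32_t row, uint32_t channel, uint32_t samp,
                             uint32_t nrows, uint32_t nchannels, uint32_t ncols) {
        return ((samp*nchannels + channel)*nrows + row)*ncols;
    }

    int main() {
        // e.g. a permuted view with 8 columns per row, but rows 16 elements apart
        const uint32_t ncols = 8, nrows = 4, nchannels = 2;
        printf("a_offset = %u, d_offset = %u\n",
               a_offset(3, 1, 0, /*stride_row=*/16, /*stride_channel=*/64, /*stride_sample=*/128),
               d_offset(3, 1, 0, nrows, nchannels, ncols));
        return 0; // prints a_offset = 112, d_offset = 56
    }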