diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 12ecd6451..31a11cbec 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3159,7 +3159,7 @@ class LLaDAModel(TextModel): yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Ernie4_5_ForCausalLM") +@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM") class Ernie4_5Model(TextModel): model_arch = gguf.MODEL_ARCH.ERNIE4_5 @@ -6254,9 +6254,11 @@ class DeepseekModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("DeepseekV2ForCausalLM") -@ModelBase.register("DeepseekV3ForCausalLM") -@ModelBase.register("KimiVLForConditionalGeneration") +@ModelBase.register( + "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "KimiVLForConditionalGeneration", +) class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -8507,6 +8509,43 @@ class PixtralModel(LlavaVisionModel): return "mm.2.weight" return super().map_tensor_name(name, try_suffixes) + +@ModelBase.register("KimiVLForConditionalGeneration") +class KimiVLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 64 * 14 # for compatibility + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_projector_scale_factor(2) + # eps is the same as pytorch's default value + assert self.hparams_vision is not None + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + + if is_vision_tensor: + if "pos_emb.weight" in name: + data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) + elif "wqkv" in name: + split_dim = 0 if "weight" in name else -1 + wq, wk, wv = data_torch.chunk(3, dim=split_dim) + return [ + (self.map_tensor_name(name.replace("wqkv", "wq")), wq), + (self.map_tensor_name(name.replace("wqkv", "wk")), wk), + (self.map_tensor_name(name.replace("wqkv", "wv")), wv) + ] + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + ###### CONVERSION LOGIC ###### diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index f6f8590b2..0795d229c 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -424,16 +424,28 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { template static __device__ __forceinline__ int warp_reduce_all(int x) { -#ifdef GGML_USE_HIP + if (width == ggml_cuda_get_physical_warp_size()) { + return __all_sync(0xffffffff, x); + } else { #pragma unroll - for (int offset = width/2; offset > 0; offset >>= 1) { - x = x && __shfl_xor_sync(0xffffffff, x, offset, width); + for (int offset = width/2; offset > 0; offset >>= 1) { + x = __shfl_xor_sync(0xffffffff, x, offset, width) && x; + } + return x; + } +} + +template +static __device__ __forceinline__ int warp_reduce_any(int x) { + if (width == ggml_cuda_get_physical_warp_size()) { + return __any_sync(0xffffffff, x); + } else { +#pragma unroll + for (int offset = width/2; offset > 0; offset >>= 1) { + x = __shfl_xor_sync(0xffffffff, x, offset, width) || x; + } + return x; } - return x; -#else - static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented"); - return __all_sync(0xffffffff, x); -#endif // GGML_USE_HIP } template diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 9ba260f36..5e5207a30 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -207,6 +207,8 @@ static ggml_cuda_device_info ggml_cuda_init() { //#endif // GGML_CUDA_FORCE_CUBLAS GGML_LOG_INFO("---\nInitializing CUDA/HIP, please wait, the following step may take a few minutes (only for first launch)...\n---\n"); GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); + + std::vector> turing_devices_without_mma; for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; @@ -264,7 +266,25 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].cc = 100*prop.major + 10*prop.minor; GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); -#endif // defined(GGML_USE_HIP) + std::string device_name(prop.name); + if (device_name == "NVIDIA GeForce MX450") { + turing_devices_without_mma.push_back({ id, device_name }); + } else if (device_name == "NVIDIA GeForce MX550") { + turing_devices_without_mma.push_back({ id, device_name }); + } else if (device_name.substr(0, 21) == "NVIDIA GeForce GTX 16") { + turing_devices_without_mma.push_back({ id, device_name }); + } +#endif // defined(GGML_USE_HIP) + } + + if (ggml_cuda_highest_compiled_arch(GGML_CUDA_CC_TURING) >= GGML_CUDA_CC_TURING && !turing_devices_without_mma.empty()) { + GGML_LOG_INFO("The following devices will have suboptimal performance due to a lack of tensor cores:\n"); + for (size_t device_pos = 0; device_pos < turing_devices_without_mma.size(); device_pos++) { + GGML_LOG_INFO( + " Device %d: %s\n", turing_devices_without_mma[device_pos].first, turing_devices_without_mma[device_pos].second.c_str()); + } + GGML_LOG_INFO( + "Consider compiling with CMAKE_CUDA_ARCHITECTURES=61-virtual;80-virtual and DGGML_CUDA_FORCE_MMQ to force the use of the Pascal code for Turing.\n"); } for (int id = 0; id < info.device_count; ++id) { diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 8df261579..f2892bd99 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,6 +3,140 @@ #include +// To reduce shared memory use, store "it" and "iex_used" with 22/10 bits each. +struct mmq_ids_helper_store { + uint32_t data; + + __device__ mmq_ids_helper_store(const uint32_t it, const uint32_t iex_used) { + data = (it & 0x003FFFFF) | (iex_used << 22); + } + + __device__ uint32_t it() const { + return data & 0x003FFFFF; + } + + __device__ uint32_t iex_used() const { + return data >> 22; + } +}; +static_assert(sizeof(mmq_ids_helper_store) == 4, "unexpected size for mmq_ids_helper_store"); + +// Helper function for mul_mat_id, converts ids to a more convenient format. +// ids_src1 describes how to permute the flattened column indices of src1 in order to get a compact src1 tensor sorted by expert. +// ids_dst describes the same mapping but for the dst tensor. +// The upper and lower bounds for the ith expert in the compact src1 tensor are stored in expert_bounds[i:i+1]. +template +__launch_bounds__(ggml_cuda_get_physical_warp_size(), 1) +static __global__ void mmq_ids_helper( + const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds, + const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1) { + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + const int n_expert_used = n_expert_used_template == 0 ? n_expert_used_var : n_expert_used_template; + const int expert = blockIdx.x; + + extern __shared__ char data_mmq_ids_helper[]; + mmq_ids_helper_store * store = (mmq_ids_helper_store *) data_mmq_ids_helper; + + int nex_prev = 0; // Number of columns for experts with a lower index. + int it_compact = 0; // Running index for the compact slice of this expert. + + if constexpr (n_expert_used_template == 0) { + // Generic implementation: + for (int it = 0; it < n_tokens; ++it) { + int iex_used = -1; // The index at which the expert is used, if any. + for (int iex = threadIdx.x; iex < n_expert_used; iex += warp_size) { + const int expert_used = ids[it*si1 + iex]; + nex_prev += expert_used < expert; + if (expert_used == expert) { + iex_used = iex; + } + } + + if (iex_used != -1) { + store[it_compact] = mmq_ids_helper_store(it, iex_used); + } + + if (warp_reduce_any(iex_used != -1)) { + it_compact++; + } + } + } else { + // Implementation optimized for specific numbers of experts used: + static_assert(n_expert_used == 6 || warp_size % n_expert_used == 0, "bad n_expert_used"); + const int neu_padded = n_expert_used == 6 ? 8 : n_expert_used; // Padded to next higher power of 2. + for (int it0 = 0; it0 < n_tokens; it0 += warp_size/neu_padded) { + const int it = it0 + threadIdx.x / neu_padded; + + const int iex = threadIdx.x % neu_padded; // The index at which the expert is used, if any. + const int expert_used = (neu_padded == n_expert_used || iex < n_expert_used) && it < n_tokens ? + ids[it*si1 + iex] : INT_MAX; + const int iex_used = expert_used == expert ? iex : -1; + nex_prev += expert_used < expert; + + // Whether the threads at this token position have used the expert: + const int it_compact_add_self = warp_reduce_any(iex_used != -1); + + // Do a scan over threads at lower token positions in warp to get the correct index for writing data: + int it_compact_add_lower = 0; +#pragma unroll + for (int offset = neu_padded; offset < warp_size; offset += neu_padded) { + const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size); + if (threadIdx.x >= offset) { + it_compact_add_lower += tmp; + } + } + + if (iex_used != -1) { + store[it_compact + it_compact_add_lower] = mmq_ids_helper_store(it, iex_used); + } + + // The thread with the highest index in the warp always has the sum over the whole warp, use it to increment all threads: + it_compact += __shfl_sync(0xFFFFFFFF, it_compact_add_lower + it_compact_add_self, warp_size - 1, warp_size); + } + } + nex_prev = warp_reduce_sum(nex_prev); + + for (int itc = threadIdx.x; itc < it_compact; itc += warp_size) { + const mmq_ids_helper_store store_it = store[itc]; + const int it = store_it.it(); + const int iex_used = store_it.iex_used(); + ids_src1[nex_prev + itc] = it*sis1 + iex_used % nchannels_y; + ids_dst [nex_prev + itc] = it*n_expert_used + iex_used; + } + + if (threadIdx.x != 0) { + return; + } + + expert_bounds[expert] = nex_prev; + + if (expert < gridDim.x - 1) { + return; + } + + expert_bounds[gridDim.x] = nex_prev + it_compact; +} + +template +static void launch_mmq_ids_helper( + const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds, + const int n_experts, const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1, cudaStream_t stream) { + GGML_ASSERT(n_tokens < (1 << 22) && "too few bits in mmq_ids_helper_store"); + GGML_ASSERT(n_expert_used_var < (1 << 10) && "too few bits in mmq_ids_helper_store"); + + const int id = ggml_cuda_get_device(); + const int warp_size = ggml_cuda_info().devices[id].warp_size; + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; + CUDA_SET_SHARED_MEMORY_LIMIT(mmq_ids_helper, smpbo); + + const dim3 num_blocks(n_experts, 1, 1); + const dim3 block_size(warp_size, 1, 1); + const size_t nbytes_shared = n_tokens*sizeof(mmq_ids_helper_store); + GGML_ASSERT(nbytes_shared <= smpbo); + mmq_ids_helper<<>> + (ids, ids_src1, ids_dst, expert_bounds, n_tokens, n_expert_used_var, nchannels_y, si1, sis1); +} + static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { case GGML_TYPE_Q4_0: @@ -137,7 +271,7 @@ void ggml_cuda_mul_mat_q( ne00, ne01, ne1, s01, ne11, s1, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, - use_stream_k}; + use_stream_k, ne1}; ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); return; } @@ -148,54 +282,50 @@ void ggml_cuda_mul_mat_q( const int64_t n_expert_used = ids->ne[0]; const int64_t ne_get_rows = ne12 * n_expert_used; + GGML_ASSERT(ne1 == n_expert_used); - std::vector ids_host(ggml_nbytes(ids)); - std::vector ids_src1_host; - ids_src1_host.reserve(ne_get_rows); - std::vector ids_dst_host; - ids_dst_host.reserve(ne_get_rows); - std::vector tokens_per_expert_host(ne02); - std::vector expert_bounds_host(ne02 + 1); - ggml_cuda_pool_alloc ids_buf_dev(ctx.pool()); + ggml_cuda_pool_alloc ids_src1(ctx.pool(), ne_get_rows); + ggml_cuda_pool_alloc ids_dst(ctx.pool(), ne_get_rows); + ggml_cuda_pool_alloc expert_bounds(ctx.pool(), ne02 + 1); - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); + { + GGML_ASSERT(ids->nb[0] == ggml_element_size(ids)); + const int si1 = ids->nb[1] / ggml_element_size(ids); + const int sis1 = nb12 / nb11; - for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices - for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens - for (int64_t iex = 0; iex < n_expert_used; ++iex) { - const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]); - assert(expert_to_use >= 0 && expert_to_use < ne02); - if (expert_to_use == i02) { - ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11); - ids_dst_host.push_back(i12*ne1 + iex); - tokens_per_expert_host[i02]++; - break; - } - } + switch (n_expert_used) { + case 2: + launch_mmq_ids_helper< 2> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; + case 4: + launch_mmq_ids_helper< 4> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; + case 6: + launch_mmq_ids_helper< 6> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; + case 8: + launch_mmq_ids_helper< 8> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; + case 16: + launch_mmq_ids_helper<16> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; + case 32: + launch_mmq_ids_helper<32> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; + default: + launch_mmq_ids_helper< 0> ((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(), + ne02, ne12, n_expert_used, ne11, si1, sis1, stream); + break; } + CUDA_CHECK(cudaGetLastError()); } - int32_t cumsum = 0; - for (int64_t i = 0; i < ne02; ++i) { - expert_bounds_host[i] = cumsum; - cumsum += tokens_per_expert_host[i]; - } - expert_bounds_host[ne02] = cumsum; - - std::vector ids_buf_host; - ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size()); - ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end()); - ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end()); - ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end()); - ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device. - CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - const int32_t * ids_src1_dev = ids_buf_dev.ptr; - const int32_t * ids_dst_dev = ids_src1_dev + ids_src1_host.size(); - const int32_t * expert_bounds_dev = ids_dst_dev + ids_dst_host.size(); - const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 + get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq); ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), nbytes_src1_q8_1); @@ -208,7 +338,7 @@ void ggml_cuda_mul_mat_q( const int64_t s11 = src1->nb[1] / ts_src1; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s13 = src1->nb[2] / ts_src1; - quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type, + quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream); CUDA_CHECK(cudaGetLastError()); } @@ -218,11 +348,11 @@ void ggml_cuda_mul_mat_q( // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid. const mmq_args args = { - src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d, + src0_d, src0->type, (const int *) src1_q8_1.get(), ids_dst.get(), expert_bounds.get(), dst_d, ne00, ne01, ne_get_rows, s01, ne_get_rows, s1, ne02, ne02, s02, s12, s2, ne03, ne13, s03, s13, s3, - use_stream_k}; + use_stream_k, ne12}; ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); } @@ -262,7 +392,7 @@ void ggml_cuda_op_mul_mat_q( ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, - use_stream_k}; + use_stream_k, src1_ncols}; ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 3b4acd1d1..b2cbfdb77 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -3139,7 +3139,8 @@ static __global__ void mul_mat_q( const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst, const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, - const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + const int ncols_max) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { @@ -3153,7 +3154,7 @@ static __global__ void mul_mat_q( constexpr int qk = ggml_cuda_type_traits::qk; constexpr int mmq_y = get_mmq_y_device(); - const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x + const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y // Initialize the ids for writing back data with just the index. @@ -3377,7 +3378,8 @@ template static __global__ void mul_mat_q_stream_k_fixup( const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst, - const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) { + const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst, + const int ncols_max) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; @@ -3388,7 +3390,7 @@ static __global__ void mul_mat_q_stream_k_fixup( float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f}; - const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; + const int ntx = (ncols_max + mmq_x - 1) / mmq_x; const int nty = (nrows_x + mmq_y - 1) / mmq_y; const int bidx0 = blockIdx.x; @@ -3529,7 +3531,7 @@ struct mmq_args { int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst; int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst; int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst; - bool use_stream_k; + bool use_stream_k; int64_t ncols_max; }; template @@ -3559,7 +3561,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; - const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x; + const int ntx = (args.ncols_max + mmq_x - 1) / mmq_x; const int ntzw = args.nchannels_y * args.nsamples_y; const dim3 block_nums_xy_tiling(nty, ntx, ntzw); @@ -3575,14 +3577,16 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, - sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, + args.ncols_max); } else { constexpr bool need_check = true; mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, - sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, + args.ncols_max); } return; } @@ -3602,7 +3606,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, - sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, + args.ncols_max); if (!fixup_needed) { return; @@ -3610,14 +3615,16 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a mul_mat_q_stream_k_fixup<<>> (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, - args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst, + args.ncols_max); } else { constexpr bool need_check = true; mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, - sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst); + sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, + args.ncols_max); if (!fixup_needed) { return; @@ -3625,7 +3632,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a mul_mat_q_stream_k_fixup<<>> (args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst, - args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst); + args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst, + args.ncols_max); } } @@ -3650,7 +3658,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda continue; } - const int ntiles_x = (args.ncols_y + mmq_x - 1) / mmq_x; + const int ntiles_x = (args.ncols_max + mmq_x - 1) / mmq_x; if (ntiles_x < ntiles_x_best) { mmq_x_best = mmq_x; diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index d60292b83..6baab1176 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -28,7 +28,58 @@ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32 return ((const int *) x)[i32]; // assume at least 4 byte alignment } +// q4 contains 8 indices with 4 bit each. +// This function selects those bytes from table that are at those indices and returns them as int2. +// The first int contains the bytes with even indices in q4, the second int contains the bytes with odd indices in q4. static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, const int8_t * table) { +#if defined(GGML_USE_HIP) + // Load the 16-byte table into four 32-bit unsigned integers. + const uint32_t *values = (const uint32_t *)table; + + const uint32_t q_even = q4; + const uint32_t q_odd = (q4 >> 4); + + // Perform lookups in the lower half of the table (indices 0-7). + uint32_t v_even_low = __builtin_amdgcn_perm(values[1], values[0], q_even & 0x07070707); + uint32_t v_odd_low = __builtin_amdgcn_perm(values[1], values[0], q_odd & 0x07070707); + + // Perform lookups in the upper half of the table (indices 8-15). + uint32_t v_even_high = __builtin_amdgcn_perm(values[3], values[2], q_even & 0x07070707); + uint32_t v_odd_high = __builtin_amdgcn_perm(values[3], values[2], q_odd & 0x07070707); + + // Select between the low and high results based on the MSB of each index nibble. + uint32_t mask_even = 0x03020100 | ((q_even & 0x08080808) >> 1); + uint32_t res_x = __builtin_amdgcn_perm(v_even_high, v_even_low, mask_even); + uint32_t mask_odd = 0x03020100 | ((q_odd & 0x08080808) >> 1); + uint32_t res_y = __builtin_amdgcn_perm(v_odd_high, v_odd_low, mask_odd); + + return make_int2(res_x, res_y); +#elif !defined(GGML_USE_MUSA) + // CUDA does not have an instruction for selecting bytes with 4 bit indices. + // However, __byte_perm is an instruction that selects bytes with 3 bit indices that can be used instead. + const uint32_t * table32 = (const uint32_t *) table; + + // __byte_perm selects bytes based on the lower 16 bits in its third argument. + // Therefore, do 2 iterations over the 32 bits in q4 with 0 and 16 shift. + // To handle the fourth bit, first call _byte_perm both for the low and the high 64 bit of table, using the low 3 bits. + // Then, call __byte_perm again to select from the low and high bytes based on the fourth bit. + uint32_t tmp[2]; + const uint32_t low_high_selection_indices = (0x32103210 | ((q4 & 0x88888888) >> 1)); +#pragma unroll + for (uint32_t i = 0; i < 2; ++i) { + const uint32_t shift = 16 * i; + + const uint32_t low = __byte_perm(table32[0], table32[1], q4 >> shift); + const uint32_t high = __byte_perm(table32[2], table32[3], q4 >> shift); + tmp[i] = __byte_perm(low, high, low_high_selection_indices >> shift); + } + + // tmp contains the bytes from tyble in the same order as the 4 bit indices in q4. + // However, for the result we need ints with all even/odd 4 bit indices in q4. + // Therefore, 2 more calls to __byte_perm to put the bytes in the correct order. + return make_int2(__byte_perm(tmp[0], tmp[1], 0x6420), __byte_perm(tmp[0], tmp[1], 0x7531)); +#else + // Generic implementation. const int q0_32 = (q4 >> 0) & 0x0F0F0F0F; const int8_t * q0_8 = (const int8_t *) &q0_32; const char4 val0_8 = make_char4( @@ -40,6 +91,7 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con table[q1_8[0]], table[q1_8[1]], table[q1_8[2]], table[q1_8[3]]); return make_int2(*((const int *) &val0_8), *((const int *) &val1_8)); +#endif } // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 6e9c67aca..c6a33d5de 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -22,7 +22,10 @@ #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite #define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }} #define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width) +#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width) #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define __all_sync(mask, var) __all(var) +#define __any_sync(mask, var) __any(var) #define cublasCreate hipblasCreate #define cublasDestroy hipblasDestroy #define cublasGemmEx hipblasGemmEx diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index fc6526d6d..82c1ac1da 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -320,40 +320,31 @@ typedef struct { } ggml_metal_kargs_mul_mv_ext; typedef struct { + int32_t ne02; int32_t ne10; int32_t ne11; // n_expert_used (bcast) uint64_t nb11; uint64_t nb12; - int32_t neh11; // n_tokens - uint64_t nbh11; + int32_t ne21; // n_tokens int32_t ne20; // n_expert_used uint64_t nb21; } ggml_metal_kargs_mul_mm_id_map0; -typedef struct { - int32_t ne20; // n_expert_used - int32_t neh0; - int32_t neh1; - uint64_t nbh1; - uint64_t nbh2; - int32_t ne0; - uint64_t nb1; - uint64_t nb2; -} ggml_metal_kargs_mul_mm_id_map1; - typedef struct { int32_t ne00; int32_t ne02; uint64_t nb01; uint64_t nb02; uint64_t nb03; - int32_t neh12; - uint64_t nbh10; - uint64_t nbh11; - uint64_t nbh12; - uint64_t nbh13; - int32_t neh0; - int32_t neh1; + int32_t ne11; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + int32_t ne20; + int32_t ne21; + int32_t ne0; + int32_t ne1; int16_t r2; int16_t r3; } ggml_metal_kargs_mul_mm_id; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a8e5815cd..7fa651f71 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -93,35 +93,37 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); - ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; - ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + if (ctx->mtl_device) { + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; - ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; + ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; #if defined(GGML_METAL_HAS_RESIDENCY_SETS) - ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil; + ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil; #endif - ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; - ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; + ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; + ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6]; #if defined(GGML_METAL_USE_BF16) - ctx->use_bfloat = ctx->has_bfloat; + ctx->use_bfloat = ctx->has_bfloat; #else - ctx->use_bfloat = false; + ctx->use_bfloat = false; #endif - ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil; + ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil; - { - const char * val = getenv("GGML_METAL_FUSION_DEBUG"); - ctx->debug_fusion = val ? atoi(val) : 0; + { + const char * val = getenv("GGML_METAL_FUSION_DEBUG"); + ctx->debug_fusion = val ? atoi(val) : 0; + } + + memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt)); + + ctx->max_size = ctx->mtl_device.maxBufferLength; + + strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } - - memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt)); - - ctx->max_size = ctx->mtl_device.maxBufferLength; - - strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } ctx->mtl_device_ref_count++; @@ -396,8 +398,12 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, - GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_1, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_2, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_4, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_6, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_8, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_16, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, @@ -1426,8 +1432,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, mul_mm_id_map0_f16, has_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, mul_mm_id_map1_f32, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_1, mul_mm_id_map0_f16_ne20_1, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_2, mul_mm_id_map0_f16_ne20_2, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_4, mul_mm_id_map0_f16_ne20_4, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_6, mul_mm_id_map0_f16_ne20_6, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_8, mul_mm_id_map0_f16_ne20_8, has_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_16, mul_mm_id_map0_f16_ne20_16, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16, mul_mm_id_f16_f16, has_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16, mul_mm_id_bf16_f16, has_simdgroup_mm && use_bfloat); @@ -1874,7 +1884,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_ROPE: return true; case GGML_OP_IM2COL: - return op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); + return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); case GGML_OP_POOL_1D: return false; case GGML_OP_UPSCALE: @@ -3906,38 +3916,6 @@ static int ggml_metal_encode_node( default: break; } - const int64_t neh10 = ne10; // n_embd - const int64_t neh11 = ne21; // n_tokens - const int64_t neh12 = ne02; // n_expert - - const uint64_t nbh10 = ggml_type_size(GGML_TYPE_F16); - const uint64_t nbh11 = nbh10*neh10; - const uint64_t nbh12 = nbh11*neh11; - const uint64_t nbh13 = nbh12*neh12; - - const size_t s_src1 = ggml_type_size(GGML_TYPE_F16)*neh10*neh11*neh12; - id h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1); - if (!h_src1) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1); - return 0; - } - - const int64_t neh0 = ne0; - const int64_t neh1 = ne21; - const int64_t neh2 = ne02; - - const uint64_t nbh0 = ggml_type_size(GGML_TYPE_F32); - const uint64_t nbh1 = nbh0*neh0; - const uint64_t nbh2 = nbh1*neh1; - //const uint64_t nbh3 = nbh2*neh2; - - const size_t s_dst = ggml_type_size(GGML_TYPE_F32)*neh0*neh1*neh2; - id h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst); - if (!h_dst) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst); - return 0; - } - // tokens per expert const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02; id h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe); @@ -3947,8 +3925,8 @@ static int ggml_metal_encode_node( } // id map - // [n_expert_used, n_tokens] - const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne20*ne21; + // [n_tokens, n_expert] + const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne21*ne02; id h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids); if (!h_ids) { GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids); @@ -3956,32 +3934,45 @@ static int ggml_metal_encode_node( } { - const int nth = MIN(1024, ne10/4); - ggml_metal_kargs_mul_mm_id_map0 args = { + ne02, ne10, - ne11, // n_expert_used (bcast) + ne11, // n_expert_used (bcast) nb11, nb12, - neh11, // n_tokens - nbh11, - ne20, // n_expert_used + ne21, // n_tokens + ne20, // n_expert_used nb21, }; id pipeline = nil; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16].pipeline; + pipeline = nil; + + switch (ne20) { + case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_1 ].pipeline; break; + case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_2 ].pipeline; break; + case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_4 ].pipeline; break; + case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_6 ].pipeline; break; + case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_8 ].pipeline; break; + case 16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16_NE20_16].pipeline; break; + default: GGML_ABORT("missing specialization for ne20 = %d", (int) ne20); + } + + GGML_ASSERT(ne02 <= (int) pipeline.maxTotalThreadsPerThreadgroup); + + const size_t smem = ne02*ne20*sizeof(uint16_t); + + GGML_ASSERT(smem <= device.maxThreadgroupMemoryLength); [encoder setComputePipelineState:pipeline]; [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; - [encoder setBuffer: h_src1 offset:0 atIndex:3]; - [encoder setBuffer: h_tpe offset:0 atIndex:4]; - [encoder setBuffer: h_ids offset:0 atIndex:5]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:1]; + [encoder setBuffer: h_tpe offset:0 atIndex:2]; + [encoder setBuffer: h_ids offset:0 atIndex:3]; + [encoder setThreadgroupMemoryLength:smem atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne02, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(ne02, 1, 1)]; } { @@ -4020,13 +4011,15 @@ static int ggml_metal_encode_node( /*.nb01 =*/ nb01, /*.nb02 =*/ nb02, /*.nb03 =*/ nb03, - /*.neh12 =*/ neh12, - /*.nbh10 =*/ nbh10, - /*.nbh11 =*/ nbh11, - /*.nbh12 =*/ nbh12, - /*.nbh13 =*/ nbh13, - /*.neh0 =*/ neh0, - /*.neh1 =*/ neh1, + /*.ne11 =*/ ne11, // n_expert_used (bcast) + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.ne20 =*/ ne20, // n_expert_used + /*.ne21 =*/ ne21, // n_tokens + /*.ne0 =*/ ne0, + /*.ne1 =*/ ne1, /*.r2 =*/ r2, /*.r3 =*/ r3, }; @@ -4034,42 +4027,14 @@ static int ggml_metal_encode_node( [encoder setComputePipelineState:pipeline]; [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer: h_src1 offset:0 atIndex:2]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; [encoder setBuffer: h_tpe offset:0 atIndex:3]; - [encoder setBuffer: h_dst offset:0 atIndex:4]; + [encoder setBuffer: h_ids offset:0 atIndex:4]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:5]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, ne02) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } - - { - GGML_ASSERT(ne0 % 4 == 0); - - const int nth = MIN(1024, ne0/4); - - ggml_metal_kargs_mul_mm_id_map1 args = { - ne20, // n_expert_used - neh0, - neh1, - nbh1, - nbh2, - ne0, - nb1, - nb2, - }; - - id pipeline = nil; - - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32].pipeline; - - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer: h_dst offset:0 atIndex:1]; - [encoder setBuffer: h_ids offset:0 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne20, ne21, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } } else { id pipeline = nil; @@ -4729,7 +4694,6 @@ static int ggml_metal_encode_node( } break; case GGML_OP_IM2COL: { - GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3dd55fd32..7037c1aa0 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -974,9 +974,16 @@ kernel void kernel_mul( device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10)); + if (args.ne10 == 1) { + const float x = *((device float *)(src1_ptr)); + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x; + } + } else { + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = i0%args.ne10; + *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10)); + } } } @@ -1000,9 +1007,16 @@ kernel void kernel_div( device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10)); + if (args.ne10 == 1) { + const float x = 1.0f / *((device float *)(src1_ptr)); + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x; + } + } else { + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = i0%args.ne10; + *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10)); + } } } @@ -7491,97 +7505,81 @@ kernel void kernel_mul_mm( } } -template +template // n_expert_used kernel void kernel_mul_mm_id_map0( constant ggml_metal_kargs_mul_mm_id_map0 & args, - device const char * src1, device const char * src2, - device char * hsrc1, device char * htpe, device char * hids, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int ide = tgpig[0]; // expert id + threadgroup char * shmem [[threadgroup(0)]], + ushort tpitg[[thread_position_in_threadgroup]], + ushort ntg[[threads_per_threadgroup]]) { + const short ide = tpitg; // expert id - int n_all = 0; + uint32_t n_all = 0; - device int32_t * ids_i32 = (device int32_t *) (hids); + device int32_t * ids_i32 = (device int32_t *) hids + ide*args.ne21; - for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens - device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21); + for (int i21 = 0; i21 < args.ne21; i21 += ntg) { // n_tokens + if (i21 + tpitg < args.ne21) { + device const int32_t * src2_i32 = (device const int32_t *) (src2 + (i21 + tpitg)*args.nb21); - for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used - if (src2_i32[i20] != ide) { - continue; + threadgroup uint16_t * sids = (threadgroup uint16_t *) shmem + tpitg*ne20; + + #pragma unroll(ne20) + for (short i20 = 0; i20 < ne20; i20++) { + sids[i20] = src2_i32[i20]; } - - device const float4 * src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11); - device T4 * hsrc1_f32x4 = (device T4 *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11); - - for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) { - hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]); - } - - if (tpitg.x == 0) { - ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all; - } - - ++n_all; } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (short t = 0; t < ntg; t++) { + if (i21 + t >= args.ne21) { + break; + } + + threadgroup const uint16_t * sids = (threadgroup const uint16_t *) shmem + t*ne20; + + short sel = 0; + #pragma unroll(ne20) + for (short i20 = 0; i20 < ne20; i20++) { + sel += (sids[i20] == ide)*(i20 + 1); + } + + ids_i32[n_all] = (i21 + t)*ne20 + sel - 1; + + n_all += sel > 0; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); } - if (tpitg.x == 0) { - device int32_t * tpe_i32 = (device int32_t *) (htpe); - tpe_i32[ide] = n_all; - } + device uint32_t * tpe_u32 = (device uint32_t *) (htpe); + tpe_u32[ide] = n_all; } -typedef decltype(kernel_mul_mm_id_map0) kernel_mul_mm_id_map0_t; +typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t; -template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0; - -template -kernel void kernel_mul_mm_id_map1( - constant ggml_metal_kargs_mul_mm_id_map1 & args, - device const char * hdst, - device const char * hids, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i20 = tgpig[0]; // used expert - const int i21 = tgpig[1]; // token - - device const int32_t * ids_i32 = (device const int32_t *) (hids); - device float4 * dst_f32x4 = (device float4 *) (dst + i20*args.nb1 + i21*args.nb2); - - const int id = ids_i32[i21*args.ne20 + i20]; - - const int ide = id / args.neh1; - const int idt = id % args.neh1; - - device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2); - - for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) { - dst_f32x4[i0] = hdst_f32x4[i0]; - } -} - -typedef decltype(kernel_mul_mm_id_map1) kernel_mul_mm_id_map1_t; - -template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1; +template [[host_name("kernel_mul_mm_id_map0_f16_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>; +template [[host_name("kernel_mul_mm_id_map0_f16_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>; +template [[host_name("kernel_mul_mm_id_map0_f16_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>; +template [[host_name("kernel_mul_mm_id_map0_f16_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>; +template [[host_name("kernel_mul_mm_id_map0_f16_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>; +template [[host_name("kernel_mul_mm_id_map0_f16_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>; template kernel void kernel_mul_mm_id( constant ggml_metal_kargs_mul_mm_id & args, device const char * src0, device const char * src1, - device const char * tpe, + device const char * htpe, + device const char * hids, device char * dst, threadgroup char * shmem [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], ushort tiitg[[thread_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], ushort sgitg[[simdgroup_index_in_threadgroup]]) { threadgroup T * sa = (threadgroup T *)(shmem); @@ -7589,19 +7587,20 @@ kernel void kernel_mul_mm_id( const int r0 = tgpig.y; const int r1 = tgpig.x; - const int im = tgpig.z; + const int im = tgpig.z; // expert - device const int32_t * tpe_i32 = (device const int32_t *) (tpe); + device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe); + device const int32_t * ids_i32 = (device const int32_t *) (hids); - const int neh1 = tpe_i32[im]; + const int32_t neh1 = tpe_u32[im]; if (r1*BLOCK_SIZE_N >= neh1) { return; } // if this block is of 64x32 shape or smaller - const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; - const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; @@ -7617,20 +7616,23 @@ kernel void kernel_mul_mm_id( short il = (tiitg % THREAD_PER_ROW); - const int i12 = im%args.neh12; - const int i13 = im/args.neh12; + const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + thread_col]; - const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short i11 = (id % args.ne20) % args.ne11; + const short i12 = (id / args.ne20); + const short i13 = 0; + + const uint64_t offset0 = im*args.nb02 + i13*args.nb03; const short offset1 = il/nl; device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - device const half * y = (device const half *)(src1 - + args.nbh13*i13 - + args.nbh12*i12 - + args.nbh11*(r1*BLOCK_SIZE_N + thread_col) - + args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + device const float * y = (device const float *)(src1 + + args.nb13*i13 + + args.nb12*i12 + + args.nb11*i11 + + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory @@ -7646,7 +7648,7 @@ kernel void kernel_mul_mm_id( + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y); + *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (half2x4)(*((device float2x4 *) y)); il = (il + 2 < nl) ? il + 2 : il % 2; x = (il < 2) ? x + (2 + nl - 1)/nl : x; @@ -7682,43 +7684,38 @@ kernel void kernel_mul_mm_id( } } - if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) { - device float * C = (device float *) dst + - (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ - (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0; + threadgroup_barrier(mem_flags::mem_threadgroup); - for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0); - } - } else { - // block is smaller than 64x32, we should avoid writing data outside of the matrix - threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; - for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); + threadgroup float * temp_str = ((threadgroup float *) shmem) \ + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; + + #pragma unroll(8) + for (short i = 0; i < 8; i++) { + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (short j = sgitg; j < n_cols; j += 4) { + const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + j]; + + const short ide = id % args.ne20; + const short idt = id / args.ne20; + + device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + ide*args.ne0 + idt*args.ne1*args.ne0; + device float4 * D4 = (device float4 *) D; + + threadgroup float * C = (threadgroup float *) shmem + (j*BLOCK_SIZE_M); + threadgroup float4 * C4 = (threadgroup float4 *) C; + + int i = tiisg; + for (; i < n_rows/4; i += 32) { + *(D4 + i) = *(C4 + i); } - threadgroup_barrier(mem_flags::mem_threadgroup); - - if (sgitg == 0) { - for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0; - device float4 * D4 = (device float4 *) D; - - threadgroup float * C = temp_str + (j*BLOCK_SIZE_M); - threadgroup float4 * C4 = (threadgroup float4 *) C; - - int i = 0; - for (; i < n_rows/4; i++) { - *(D4 + i) = *(C4 + i); - } - - i *= 4; - for (; i < n_rows; i++) { - *(D + i) = *(C + i); - } - } + i = (4*(n_rows/4)) + tiisg; + for (; i < n_rows; i += 32) { + *(D + i) = *(C + i); } } } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 51f46cf7b..9e39e5c3c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2106,10 +2106,11 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec const uint32_t warps = warptile[0] / warptile[10]; const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; - const uint32_t mmid_row_ids = mul_mat_id ? (4096 * sizeof(uint32_t) + 4/*_ne1*/) : 0; + const uint32_t mmid_row_ids = mul_mat_id ? (warptile[2] * 2 * sizeof(uint16_t)) : 0; const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; + const uint32_t ballots_sh = mul_mat_id ? (warps * 4 * sizeof(uint32_t)) : 0; - const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size; + const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size + ballots_sh; const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), " @@ -2199,7 +2200,7 @@ static void ggml_vk_load_shaders(vk_device& device) { const uint32_t mul_mat_subgroup_size_32 = std::max(mul_mat_subgroup_size, 32u); const bool subgroup_min_size_16 = (!device->subgroup_size_control && device->subgroup_size >= 16) || - (device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16); + (device->subgroup_size_control && device->subgroup_max_size >= 16); // mulmat std::vector l_warptile, m_warptile, s_warptile, @@ -6318,7 +6319,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t nei0 = ids->ne[0]; const uint64_t nei1 = ids->ne[1]; - GGML_ASSERT(nei0 * nei1 <= 4096); const uint32_t nbi1 = ids->nb[1]; const uint32_t nbi2 = ids->nb[2]; @@ -6758,37 +6758,7 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } else { - // Split based on number of ids, to fit in shared memory - const uint32_t nei0 = (uint32_t)src2->ne[0]; - const uint32_t nei1 = (uint32_t)src2->ne[1]; - - GGML_ASSERT(nei0 <= 4096); - const uint32_t split_size = std::min(nei1, 4096u / nei0); - - if (split_size == nei1) { - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); - } else { - ggml_tensor src1_copy = *src1; - ggml_tensor src2_copy = *src2; - ggml_tensor dst_copy = *dst; - - for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) { - const uint32_t n_tokens = std::min(split_size, nei1 - token_start); - - src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2]; - src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1]; - dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2]; - - src1_copy.ne[2] = n_tokens; - src2_copy.ne[1] = n_tokens; - dst_copy.ne[2] = n_tokens; - - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun); - // invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor - ctx->prealloc_y_last_pipeline_used = {}; - ctx->prealloc_y_last_tensor_used = nullptr; - } - } + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 40c0d9b0c..5ecf68a64 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -109,13 +109,13 @@ shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; #define NUM_WARPS (BLOCK_SIZE / WARP) #ifdef MUL_MAT_ID -shared u16vec2 row_ids[4096]; +shared u16vec2 row_ids[BN]; uint _ne1; #ifdef MUL_MAT_ID_USE_SUBGROUPS shared uvec4 ballots_sh[NUM_WARPS]; -void load_row_ids(uint expert_idx, bool nei0_is_pow2) { +void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) { _ne1 = 0; uint num_elements = p.nei1 * p.nei0; uint nei0shift = findLSB(p.nei0); @@ -165,11 +165,14 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2) { barrier(); uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot); - if (in_range && id == expert_idx) { - row_ids[_ne1 + idx] = u16vec2(ii0, ii1); + if (in_range && id == expert_idx && _ne1 + idx >= ic * BN && _ne1 + idx < (ic + 1) * BN) { + row_ids[_ne1 + idx - ic * BN] = u16vec2(ii0, ii1); } _ne1 += total; iter &= 15; + if (_ne1 >= (ic + 1) * BN) { + break; + } } barrier(); } @@ -242,16 +245,18 @@ void main() { #ifdef MUL_MAT_ID #ifdef MUL_MAT_ID_USE_SUBGROUPS if (bitCount(p.nei0) == 1) { - load_row_ids(expert_idx, true); + load_row_ids(expert_idx, true, ic); } else { - load_row_ids(expert_idx, false); + load_row_ids(expert_idx, false, ic); } #else _ne1 = 0; - for (uint ii1 = 0; ii1 < p.nei1; ii1++) { - for (uint ii0 = 0; ii0 < p.nei0; ii0++) { + for (uint ii1 = 0; ii1 < p.nei1 && _ne1 < (ic + 1) * BN; ii1++) { + for (uint ii0 = 0; ii0 < p.nei0 && _ne1 < (ic + 1) * BN; ii0++) { if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) { - row_ids[_ne1] = u16vec2(ii0, ii1); + if (_ne1 >= ic * BN) { + row_ids[_ne1 - ic * BN] = u16vec2(ii0, ii1); + } _ne1++; } } @@ -797,7 +802,7 @@ void main() { [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { #if LOAD_VEC_B == 8 #ifdef MUL_MAT_ID - const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l]; + const u16vec2 row_idx = row_ids[loadc_b + l]; const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b; #else const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; @@ -813,7 +818,7 @@ void main() { buf_b[buf_idx + 7] = FLOAT_TYPE(data_b[idx][1].w); #elif LOAD_VEC_B == 4 #ifdef MUL_MAT_ID - const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l]; + const u16vec2 row_idx = row_ids[loadc_b + l]; const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b; #else const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; @@ -832,7 +837,7 @@ void main() { #else const uint row_i = ic * BN + loadc_b + l; if (row_i < _ne1 && block + loadr_b < end_k) { - const u16vec2 row_idx = row_ids[row_i]; + const u16vec2 row_idx = row_ids[loadc_b + l]; buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); } else { buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); @@ -903,7 +908,7 @@ void main() { const uint row_i = dc + cm_col * TN + col + store_c; if (row_i >= _ne1) break; - const u16vec2 row_idx = row_ids[row_i]; + const u16vec2 row_idx = row_ids[row_i - ic * BN]; if (dr + cm_row * TM + store_r < p.M) { data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); @@ -953,7 +958,7 @@ void main() { const uint row_i = dc_warp + cc; if (row_i >= _ne1) break; - const u16vec2 row_idx = row_ids[row_i]; + const u16vec2 row_idx = row_ids[row_i - ic * BN]; #endif // MUL_MAT_ID [[unroll]] for (uint cr = 0; cr < TM; cr++) { #ifdef MUL_MAT_ID diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 4d16eb079..f5aebf6e9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -93,7 +93,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #ifdef MUL_MAT_ID layout (binding = 3) readonly buffer IDS {int data_ids[];}; -shared u16vec4 row_ids[4096]; +shared u16vec4 row_ids[BN]; layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB { B_TYPE b[]; @@ -111,7 +111,7 @@ B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const i return B_TYPE(0.0); } - const u16vec4 row_idx = row_ids[row_i]; + const u16vec4 row_idx = row_ids[row_i & (BN - 1)]; B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]]; return ret; @@ -123,14 +123,14 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem uint dc = ic * BN + c; if (dr < p.M && dc < _ne1) { - uint row_i = dc; + uint row_i = c; const u16vec4 row_idx = row_ids[row_i]; data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem; } return elem; } -void load_row_ids(uint expert_idx, bool nei0_is_pow2) { +void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) { _ne1 = 0; uint num_elements = p.nei1 * p.nei0; uint nei0shift = findLSB(p.nei0); @@ -180,11 +180,14 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2) { barrier(); uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot); - if (in_range && id == expert_idx) { - row_ids[_ne1 + idx] = u16vec4(fastmod(ii0, p.ne11), ii1, ii0, 0); + if (in_range && id == expert_idx && _ne1 + idx >= ic * BN && _ne1 + idx < (ic + 1) * BN) { + row_ids[_ne1 + idx - ic * BN] = u16vec4(fastmod(ii0, p.ne11), ii1, ii0, 0); } _ne1 += total; iter &= 15; + if (_ne1 >= (ic + 1) * BN) { + break; + } } barrier(); } @@ -218,9 +221,9 @@ void main() { #ifdef MUL_MAT_ID if (bitCount(p.nei0) == 1) { - load_row_ids(expert_idx, true); + load_row_ids(expert_idx, true, ic); } else { - load_row_ids(expert_idx, false); + load_row_ids(expert_idx, false, ic); } // Workgroup has no work diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index d03a02c7b..b9d1235d1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2850,6 +2850,7 @@ class VisionProjectorType: QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" LFM2 = "lfm2" + KIMIVL = "kimivl" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 87edaa323..abb21fa82 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -427,7 +427,6 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid - "model.layers.{bid}.block_sparse_moe.gate", # smallthinker "model.transformer.blocks.{bid}.ff_proj", # llada "layers.{bid}.mlp.gate_proj", # qwen3-embedding ), @@ -1123,6 +1122,7 @@ class TensorNameMap: "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl + "vision_tower.patch_embed.proj", # kimi-vl ), MODEL_TENSOR.V_ENC_EMBD_POS: ( @@ -1131,6 +1131,7 @@ class TensorNameMap: "vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM "vision_model.positional_embedding_vlm", # llama 4 + "vision_tower.patch_embed.pos_emb", # kimi-vl ), MODEL_TENSOR.V_ENC_ATTN_Q: ( @@ -1142,6 +1143,7 @@ class TensorNameMap: "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated + "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1158,6 +1160,7 @@ class TensorNameMap: "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated + "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1174,6 +1177,7 @@ class TensorNameMap: "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated + "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1186,6 +1190,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4 "visual.blocks.{bid}.norm1", # qwen2vl + "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1) ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1198,6 +1203,7 @@ class TensorNameMap: "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral "visual.blocks.{bid}.attn.proj", # qwen2vl + "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( @@ -1210,6 +1216,7 @@ class TensorNameMap: "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral "visual.blocks.{bid}.norm2", # qwen2vl + "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1) ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1222,6 +1229,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.mlp.fc1", # llama4 "visual.blocks.{bid}.mlp.fc1", # qwen2vl "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl + "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1) ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1240,6 +1248,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.mlp.fc2", # llama4 "visual.blocks.{bid}.mlp.fc2", # qwen2vl "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl + "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1) ), MODEL_TENSOR.V_LAYER_SCALE_1: ( @@ -1264,6 +1273,7 @@ class TensorNameMap: "model.vision_model.post_layernorm", # SmolVLM "vision_model.layernorm_post", # llama4 "visual.merger.ln_q", # qwen2vl + "vision_tower.encoder.final_layernorm", # kimi-vl ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1273,6 +1283,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_INP_NORM: ( "multi_modal_projector.norm", "multi_modal_projector.layer_norm", + "multi_modal_projector.pre_norm", "pre_mm_projector_norm", ), diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e9e92d6b3..e3301284e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -280,7 +280,7 @@ llama_context::llama_context( } // reserve worst-case graph - if (!hparams.vocab_only && memory) { + if (!hparams.vocab_only) { const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -292,11 +292,13 @@ llama_context::llama_context( int n_splits_tg = -1; int n_nodes_tg = -1; - // simulate full KV cache - - const auto mctx = memory->init_full(); - if (!mctx) { - throw std::runtime_error("failed to initialize KV cache"); + llama_memory_context_ptr mctx; + if (memory) { + LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__); + mctx = memory->init_full(); + if (!mctx) { + throw std::runtime_error("failed to initialize memory module"); + } } cross.v_embd.clear(); @@ -1056,7 +1058,7 @@ int llama_context::decode(const llama_batch & batch_inp) { const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); if (!res) { - // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache + // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module llama_pos pos_min[LLAMA_MAX_SEQ]; for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { pos_min[s] = std::numeric_limits::max(); @@ -1073,7 +1075,7 @@ int llama_context::decode(const llama_batch & batch_inp) { continue; } - LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]); + LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]); memory->seq_rm(s, pos_min[s], -1); } @@ -1857,7 +1859,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { } if (memory != nullptr) { - LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); + LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__); memory->state_write(io); } @@ -1943,7 +1945,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { } if (memory) { - LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); + LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__); memory->state_read(io); } diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 706ed2e3b..664b0c9ac 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -135,6 +135,7 @@ enum projector_type { PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_LFM2, + PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_UNKNOWN, }; @@ -156,6 +157,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_LFM2, "lfm2"}, + { PROJECTOR_TYPE_KIMIVL, "kimivl"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index a9ed24c0c..389fefbb0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -551,57 +551,16 @@ struct clip_graph { cur); } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { + // pixel_shuffle // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 - const int scale_factor = model.hparams.proj_scale_factor; - const int n_embd = cur->ne[0]; - const int seq = cur->ne[1]; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - const int height = std::sqrt(seq); - const int width = std::sqrt(seq); - GGML_ASSERT(scale_factor != 0); - cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - height / scale_factor, - width / scale_factor, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_3d(ctx0, cur, - n_embd * scale_factor * scale_factor, - seq / (scale_factor * scale_factor), - bsz); - + cur = build_patch_merge_permute(cur, scale_factor); cur = ggml_mul_mat(ctx0, model.projection, cur); + } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { // pixel unshuffle block const int scale_factor = model.hparams.proj_scale_factor; - GGML_ASSERT(scale_factor > 1); - - const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; - - // pad width and height to factor - const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; - const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; - cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); - if (pad_width || pad_height) { - cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); - width += pad_width; - height += pad_height; - } - - // unshuffle h - cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - // unshuffle w - cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = build_patch_merge_permute(cur, scale_factor); // projection cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm @@ -1111,7 +1070,7 @@ struct clip_graph { n_patches_x / scale_factor, n_patches_y / scale_factor, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D cur = ggml_cont_2d(ctx0, cur, n_embd * scale_factor * scale_factor, @@ -1138,6 +1097,67 @@ struct clip_graph { return gf; } + ggml_cgraph * build_kimivl() { + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.proj_scale_factor; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + cur = ggml_view_2d(ctx0, cur, + n_embd, cur->ne[1] * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + cur = ggml_add(ctx0, cur, model.mm_1_b); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + cur = ggml_add(ctx0, cur, model.mm_2_b); + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; + } + // this graph is used by llava, granite and glm // due to having embedding_stack (used by granite), we cannot reuse build_vit ggml_cgraph * build_llava() { @@ -1636,18 +1656,20 @@ private: ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); - if (!pos_embd || height * width == pos_embd->ne[1]) { + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { return pos_embd; } - const int n_pos_embd = std::sqrt(pos_embd->ne[1]); - pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_pos_embd, n_pos_embd); // -> (n_embd, n_pos_embd, n_pos_embd) - pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_pos_embd, n_pos_embd, n_embd) - pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, 1); // -> (width, height, n_embd) - pos_embd = ggml_reshape_2d(ctx0, pos_embd, height * width, n_embd); // -> (height * width, n_embd) - pos_embd = ggml_transpose(ctx0, pos_embd); // -> (n_embd, height * width) - pos_embd = ggml_cont(ctx0, pos_embd); + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) return pos_embd; } @@ -2046,6 +2068,39 @@ private: return cur; } + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; + } + }; static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { @@ -2088,6 +2143,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_whisper_enc(); } break; + case PROJECTOR_TYPE_KIMIVL: + { + res = graph.build_kimivl(); + } break; default: { res = graph.build_llava(); @@ -2242,6 +2301,8 @@ struct clip_model_loader { hparams.minicpmv_query_num = 64; } else if (hparams.minicpmv_version == 5) { hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 6) { + hparams.minicpmv_query_num = 64; } else { hparams.minicpmv_query_num = 96; } @@ -2357,6 +2418,12 @@ struct clip_model_loader { hparams.image_size = 1024; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); } break; + case PROJECTOR_TYPE_KIMIVL: + { + hparams.rope_theta = 10000.0f; + hparams.warmup_image_size = hparams.patch_size * 8; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; case PROJECTOR_TYPE_GEMMA3: { // default value (used by all model sizes in gemma 3 family) @@ -2526,7 +2593,20 @@ struct clip_model_loader { // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! - if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) { + bool is_ffn_swapped = ( + // only old models need this fix + model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2 + || model.proj_type == PROJECTOR_TYPE_QWEN2VL + || model.proj_type == PROJECTOR_TYPE_QWEN25VL + || model.proj_type == PROJECTOR_TYPE_GLM_EDGE + || model.proj_type == PROJECTOR_TYPE_GEMMA3 + || model.proj_type == PROJECTOR_TYPE_IDEFICS3 + || model.proj_type == PROJECTOR_TYPE_MINICPMV + ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; + if (is_ffn_swapped) { // swap up and down weights ggml_tensor * tmp = layer.ff_up_w; layer.ff_up_w = layer.ff_down_w; @@ -2535,6 +2615,9 @@ struct clip_model_loader { tmp = layer.ff_up_b; layer.ff_up_b = layer.ff_down_b; layer.ff_down_b = tmp; + if (il == 0) { + LOG_WRN("%s: ffn up/down are swapped\n", __func__); + } } } @@ -2653,6 +2736,7 @@ struct clip_model_loader { model.projection = get_tensor(TN_MM_PROJECTOR); } break; case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: { model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); @@ -3681,7 +3765,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_y = inst.grid_size.height; return true; - } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { + } else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2 + || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL + ) { GGML_ASSERT(params.proj_scale_factor); // smart resize @@ -3861,6 +3947,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } else if (params.minicpmv_version == 5) { // MiniCPM-V 4.0 n_patches = 64; + } else if (params.minicpmv_version == 6) { + // MiniCPM-V 4.5 + n_patches = 64; } else { GGML_ABORT("Unknown minicpmv version"); } @@ -3879,12 +3968,21 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_LLAMA4: - case PROJECTOR_TYPE_LFM2: { - // both W and H are divided by proj_scale_factor + // both X and Y are downscaled by the scale factor int scale_factor = ctx->model.hparams.proj_scale_factor; n_patches /= (scale_factor * scale_factor); } break; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + { + // dynamic size + int scale_factor = ctx->model.hparams.proj_scale_factor; + int out_patch_size = params.patch_size * scale_factor; + int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + n_patches = x_patch * y_patch; + } break; case PROJECTOR_TYPE_PIXTRAL: { // dynamic size @@ -4267,6 +4365,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("positions", positions); } break; case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_KIMIVL: { // set the 2D positions int n_patches_per_col = image_size_width / patch_size; @@ -4601,6 +4700,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_QWEN2A: return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: return ctx->model.mm_2_w->ne[1]; default: GGML_ABORT("Unknown projector type"); diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index 4dda60a21..f34d858d6 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -607,6 +607,9 @@ else: elif minicpmv_version == 5: emb_dim = 2560 block_count = 27 + elif minicpmv_version == 6: + emb_dim = 4096 + block_count = 27 default_vision_config = { "hidden_size": 1152, @@ -630,6 +633,10 @@ elif minicpmv_version == 5: default_vision_config["model_type"] = "siglip_vision_model" vision_config = SiglipVisionConfig(**default_vision_config) model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 6: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) processor = None # if model.attn_pool is not None: diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index a05373d5b..cd022c5e2 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -207,7 +207,7 @@ struct mtmd_context { tok_row_end_trail = false; // no trailing end-of-row token ov_img_first = true; - } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) { + } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) { // minicpmv 2.6 format: // (overview) (slice) (slice) \n ... slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6; diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 6f8a5f86a..c64be0363 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -86,6 +86,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M" add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra + add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M" add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"