diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3e3db999c..859c1443f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8943,6 +8943,13 @@ class SmolLM3Model(LlamaModel): class GptOssModel(TextModel): model_arch = gguf.MODEL_ARCH.GPT_OSS + # TODO: remove once MXFP4 is supported more generally + def dequant_model(self): + quant_config = self.hparams.get("quantization_config") + if quant_config is not None and quant_config.get("quant_method") == "mxfp4": + return + return super().dequant_model() + def transform_nibble_layout(self, tensor): assert tensor.dtype == torch.uint8 assert tensor.shape[-1] == 16 diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 607ded855..6e7b90d42 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -1,5 +1,81 @@ #include "argsort.cuh" +#ifdef GGML_CUDA_USE_CUB +# include +using namespace cub; +#endif // GGML_CUDA_USE_CUB + +static __global__ void init_indices(int * indices, const int ncols, const int nrows) { + const int col = blockIdx.x * blockDim.x + threadIdx.x; + const int row = blockIdx.y; + + if (col < ncols && row < nrows) { + indices[row * ncols + col] = col; + } +} + +static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx <= nrows) { + offsets[idx] = idx * ncols; + } +} + +#ifdef GGML_CUDA_USE_CUB +static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, + const float * x, + int * dst, + const int ncols, + const int nrows, + ggml_sort_order order, + cudaStream_t stream) { + ggml_cuda_pool_alloc temp_indices_alloc(pool, ncols * nrows); + ggml_cuda_pool_alloc temp_keys_alloc(pool, ncols * nrows); + ggml_cuda_pool_alloc offsets_alloc(pool, nrows + 1); + + int * temp_indices = temp_indices_alloc.get(); + float * temp_keys = temp_keys_alloc.get(); + int * d_offsets = offsets_alloc.get(); + + static const int block_size = 256; + const dim3 grid_size((ncols + block_size - 1) / block_size, nrows); + init_indices<<>>(temp_indices, ncols, nrows); + + const dim3 offset_grid((nrows + block_size - 1) / block_size); + init_offsets<<>>(d_offsets, ncols, nrows); + + cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream); + + size_t temp_storage_bytes = 0; + + if (order == GGML_SORT_ORDER_ASC) { + DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place) + temp_indices, dst, // values (indices) + ncols * nrows, nrows, // num items, num segments + d_offsets, d_offsets + 1, 0, sizeof(float) * 8, // all bits + stream); + } else { + DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices, + dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, + sizeof(float) * 8, stream); + } + + ggml_cuda_pool_alloc temp_storage_alloc(pool, temp_storage_bytes); + void * d_temp_storage = temp_storage_alloc.get(); + + if (order == GGML_SORT_ORDER_ASC) { + DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, + ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8, + stream); + } else { + DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, + temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, + 0, sizeof(float) * 8, stream); + } +} +#endif // GGML_CUDA_USE_CUB + +// Bitonic sort implementation template static inline __device__ void ggml_cuda_swap(T & a, T & b) { T tmp = a; @@ -65,7 +141,12 @@ static int next_power_of_2(int x) { return n; } -static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { +static void argsort_f32_i32_cuda_bitonic(const float * x, + int * dst, + const int ncols, + const int nrows, + ggml_sort_order order, + cudaStream_t stream) { // bitonic sort requires ncols to be power of 2 const int ncols_pad = next_power_of_2(ncols); @@ -77,9 +158,11 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); if (order == GGML_SORT_ORDER_ASC) { - k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); + k_argsort_f32_i32 + <<>>(x, dst, ncols, ncols_pad); } else if (order == GGML_SORT_ORDER_DESC) { - k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); + k_argsort_f32_i32 + <<>>(x, dst, ncols, ncols_pad); } else { GGML_ABORT("fatal error"); } @@ -100,5 +183,18 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); +#ifdef GGML_CUDA_USE_CUB + const int ncols_pad = next_power_of_2(ncols); + const size_t shared_mem = ncols_pad * sizeof(int); + const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb; + + if (shared_mem > max_shared_mem || ncols > 1024) { + ggml_cuda_pool & pool = ctx.pool(); + argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream); + } else { + argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); + } +#else + argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream); +#endif } diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 602401027..0e6d777b1 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]); const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]); - if (block_nums.z > 65535) { + if (block_nums.z > 65535 || block_nums.y > 65535) { int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2)); const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1)); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1f90edb72..675275460 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3655,8 +3655,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SUM: return ggml_is_contiguous_rows(op->src[0]); case GGML_OP_ARGSORT: - // TODO: Support arbitrary column width +#ifndef GGML_CUDA_USE_CUB return op->src[0]->ne[0] <= 1024; +#else + return true; +#endif case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_GROUP_NORM: diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 7b56d87e4..026b53b28 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/src/routes/+page.svelte b/tools/server/webui/src/routes/+page.svelte index 2cd2d5c37..cd18dabcc 100644 --- a/tools/server/webui/src/routes/+page.svelte +++ b/tools/server/webui/src/routes/+page.svelte @@ -2,6 +2,9 @@ import { ChatScreen } from '$lib/components/app'; import { chatStore, isInitialized } from '$lib/stores/chat.svelte'; import { onMount } from 'svelte'; + import { page } from '$app/state'; + + let qParam = $derived(page.url.searchParams.get('q')); onMount(async () => { if (!isInitialized) { @@ -9,6 +12,11 @@ } chatStore.clearActiveConversation(); + + if (qParam !== null) { + await chatStore.createConversation(); + await chatStore.sendMessage(qParam); + } });