Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/vulkan.Dockerfile
#	benches/dgx-spark/dgx-spark.md
#	scripts/bench-models.sh
This commit is contained in:
Concedo 2026-02-05 22:24:12 +08:00
commit ada982b7c1
9 changed files with 120 additions and 12 deletions

View file

@ -176,6 +176,26 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows(ggml_me
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag(ggml_metal_library_t lib, const ggml_tensor * op) {
char base[256];
char name[256];
const int n = op->src[0]->ne[0];
snprintf(base, 256, "kernel_diag_%s", ggml_type_name(op->src[0]->type));
snprintf(name, 256, "%s_n=%d", base, n);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
}
res.nsg = 1;
res.smem = 0;
return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_metal_library_t lib, ggml_type tsrc) {
char base[256];
char name[256];

View file

@ -108,6 +108,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_1d
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);

View file

@ -1158,8 +1158,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
return has_simdgroup_reduction;
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
case GGML_OP_SOLVE_TRI:
return true;
case GGML_OP_SOLVE_TRI:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
return has_simdgroup_reduction;
@ -1241,6 +1241,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
return false;
};
}
case GGML_OP_DIAG:
return true;
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
return has_simdgroup_reduction;

View file

@ -792,6 +792,25 @@ typedef struct {
uint64_t nb3;
} ggml_metal_kargs_set_rows;
typedef struct {
int32_t ne00;
int32_t ne01;
int32_t ne02;
int32_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int32_t ne0;
int32_t ne1;
int32_t ne2;
int32_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
} ggml_metal_kargs_diag;
typedef struct {
int64_t ne00;
int64_t ne01;

View file

@ -361,6 +361,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
{
n_fuse = ggml_metal_op_set_rows(ctx, idx);
} break;
case GGML_OP_DIAG:
{
n_fuse = ggml_metal_op_diag(ctx, idx);
} break;
case GGML_OP_L2_NORM:
{
n_fuse = ggml_metal_op_l2_norm(ctx, idx);
@ -1259,6 +1263,48 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
return 1;
}
int ggml_metal_op_diag(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
ggml_metal_library_t lib = ctx->lib;
ggml_metal_encoder_t enc = ctx->enc;
GGML_TENSOR_LOCALS(int32_t, ne0, op->src[0], ne);
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
GGML_TENSOR_LOCALS(int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_kargs_diag args = {
/*.ne00 =*/ne00,
/*.ne01 =*/ne01,
/*.ne02 =*/ne02,
/*.ne03 =*/ne03,
/*.nb00 =*/nb00,
/*.nb01 =*/nb01,
/*.nb02 =*/nb02,
/*.nb03 =*/nb03,
/*.ne0 =*/ne0,
/*.ne1 =*/ne1,
/*.ne2 =*/ne2,
/*.ne3 =*/ne3,
/*.nb0 =*/nb0,
/*.nb1 =*/nb1,
/*.nb2 =*/nb2,
/*.nb3 =*/nb3,
};
auto pipeline = ggml_metal_library_get_pipeline_diag(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1);
ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 2);
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, 32, 1, 1);
return 1;
}
int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);

View file

@ -56,6 +56,7 @@ int ggml_metal_op_sum_rows (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_cumsum (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_diag (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);

View file

@ -8815,6 +8815,26 @@ kernel void kernel_set_rows_f(
}
}
kernel void kernel_diag_f32(
constant ggml_metal_kargs_diag & args,
device const char * src0,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiitg[[thread_index_in_threadgroup]]) {
constexpr short NW = N_SIMDWIDTH;
const int32_t i3 = tgpig.z;
const int32_t i2 = tgpig.y;
const int32_t i1 = tgpig.x;
device const float * src0_ptr = (device const float *)(src0 + i2*args.nb02 + i3*args.nb03);
device float * dst_ptr = (device float *)(dst + i1*args.nb01 + i2*args.nb2 + i3*args.nb3);
for (int i0 = tiitg; i0 < args.ne0; i0 += NW) {
dst_ptr[i0] = i0 == i1 ? src0_ptr[i0] : 0.0f;
}
}
constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];

View file

@ -3220,9 +3220,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
const uint32_t D_lsb = D ^ (D & (D-1));
uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
// Nvidia prefers shared memory use to load large tiles of K
// Nvidia prefers shared memory use to load large tiles of K.
// Switch to loading from global memory when it would use too much shared memory.
// AMD prefers loading K directly from global memory
const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA ? 1 : 0;
const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 ? 1 : 0;
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem};
};
@ -5590,9 +5591,9 @@ static void ggml_vk_instance_init() {
// Check if there are two physical devices corresponding to the same GPU
// This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux),
// see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication.
// However, for MoltenVK on macOS, multiple GPUs on the same card may report the same UUID,
// see https://github.com/KhronosGroup/MoltenVK/issues/2683. Until this is fixed, we'll only deduplicate
// when drivers differ (same driver + same UUID = likely different GPUs)
// MoltenVK on macOS may report the same UUID for distinct GPUs on multi-GPU cards,
// see https://github.com/KhronosGroup/MoltenVK/issues/2683. Skip when both old/new
// driver is MoltenVK
auto old_device = std::find_if(
vk_instance.device_indices.begin(),
vk_instance.device_indices.end(),
@ -5609,11 +5610,9 @@ static void ggml_vk_instance_init() {
old_id.deviceLUIDValid && new_id.deviceLUIDValid &&
std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID))
);
bool both_molten_vk = (new_driver.driverID == vk::DriverId::eMoltenvk && old_driver.driverID == vk::DriverId::eMoltenvk);
// Only deduplicate if same UUID AND different drivers
// (same driver + same UUID on MoltenVK = likely different GPUs on multi-GPU card)
bool different_driver = (old_driver.driverID != new_driver.driverID);
return same_uuid && different_driver;
return same_uuid && !both_molten_vk;
}
);
if (old_device == vk_instance.device_indices.end()) {
@ -8450,7 +8449,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co
const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br;
const uint32_t sfsh = Bc * sfshstride * acctype;
const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA;
const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256;
const uint32_t kshstride = (k_load_shmem ? hsk_pad : MatBr) / 4 + 2;
const uint32_t vsh_stride = MatBc / 4 * row_split;
const uint32_t ksh = ((kshstride >= vsh_stride) ? (Bc * kshstride) : (Bc * vsh_stride)) * f16vec4;

View file

@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version")
set(BORINGSSL_VERSION "0.20260204.0" CACHE STRING "BoringSSL version")
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")