From 26bd889ff8b4cf83117cef87797e35a8d81a4e2c Mon Sep 17 00:00:00 2001
From: "liu.shen"
Date: Sun, 9 Mar 2025 19:26:12 +0800
Subject: [PATCH] =?UTF-8?q?fix=20#829:=20=E5=85=BC=E5=AE=B9Intel=20Cascade?=
 =?UTF-8?q?=20Lake=E6=9E=B6=E6=9E=84=E7=9A=84CPU?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 third_party/llamafile/iqk_mul_mat.inc | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc
index 99ee537..35bc7b8 100644
--- a/third_party/llamafile/iqk_mul_mat.inc
+++ b/third_party/llamafile/iqk_mul_mat.inc
@@ -2385,6 +2385,11 @@ struct SimpleBits {
     __m256i values[4];
 };
 
-
+// fix for #829: detect at compile time whether AVX512VPOPCNTDQ is available.
+#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
+#define HAVE_AVX512_POPCNT 1
+#else
+#define HAVE_AVX512_POPCNT 0
+#endif
 struct EvenSignHelper {
 #if defined HAVE_FANCY_SIMD
@@ -2396,7 +2401,21 @@ struct EvenSignHelper {
     };
     IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
         aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
-        auto pcnt = _mm256_popcnt_epi32(aux);
+        // fix for #829: stay compatible with Intel Cascade Lake CPUs. Only use
+        // _mm256_popcnt_epi32 when the build targets AVX512VPOPCNTDQ.
+#if HAVE_AVX512_POPCNT
+        __m256i pcnt = _mm256_popcnt_epi32(aux);
+#else
+        // Fallback: scalar popcount of each of the eight 32-bit lanes. The lanes
+        // travel through a plain int array via store/load intrinsics instead of
+        // casting __m256i* to int*, which would break strict aliasing.
+        int lanes[8];
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(lanes), aux);
+        for (int i = 0; i < 8; ++i) {
+            lanes[i] = __builtin_popcount(lanes[i]);
+        }
+        __m256i pcnt = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(lanes));
+#endif
         sbits_t sbits;
         sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
         values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);