mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-06 04:30:03 +00:00
fix #829: 兼容Intel Cascade Lake架构的CPU
This commit is contained in:
parent
407e1b9ab2
commit
26bd889ff8
1 changed files with 23 additions and 2 deletions
23
third_party/llamafile/iqk_mul_mat.inc
vendored
23
third_party/llamafile/iqk_mul_mat.inc
vendored
|
@ -2385,7 +2385,12 @@ struct SimpleBits {
|
||||||
__m256i values[4];
|
__m256i values[4];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Fix for #829: compile-time detection of the AVX512VPOPCNTDQ extension.
// HAVE_AVX512_POPCNT is always defined: it is 1 only when both HAVE_FANCY_SIMD
// and __AVX512VPOPCNTDQ__ are defined, and 0 otherwise, so it can be tested
// with a plain `#if HAVE_AVX512_POPCNT`.
// NOTE(review): the 256-bit popcount intrinsic guarded by this macro also
// needs AVX512VL; presumably HAVE_FANCY_SIMD already implies it - confirm
// against the definition of HAVE_FANCY_SIMD.
|
||||||
|
#if defined(HAVE_FANCY_SIMD) && defined(__AVX512VPOPCNTDQ__)
|
||||||
|
#define HAVE_AVX512_POPCNT 1
|
||||||
|
#else
|
||||||
|
#define HAVE_AVX512_POPCNT 0
|
||||||
|
#endif
|
||||||
|
|
||||||
struct EvenSignHelper {
|
struct EvenSignHelper {
|
||||||
#if defined HAVE_FANCY_SIMD
|
#if defined HAVE_FANCY_SIMD
|
||||||
|
@ -2396,7 +2401,23 @@ struct EvenSignHelper {
|
||||||
};
|
};
|
||||||
IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
|
IQK_ALWAYS_INLINE void sign_2_values(__m256i aux, __m256i * values) const {
|
||||||
aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
|
aux = _mm256_and_si256(_mm256_srlv_epi32(aux, shifts), mask);
|
||||||
|
|
||||||
|
// Fix for #829: support Intel Cascade Lake CPUs. When the AVX512VPOPCNTDQ extension is not available, use a fallback implementation instead.
|
||||||
|
#if HAVE_AVX512_POPCNT
|
||||||
auto pcnt = _mm256_popcnt_epi32(aux);
|
auto pcnt = _mm256_popcnt_epi32(aux);
|
||||||
|
|
||||||
|
#else
|
||||||
|
// Fallback implementation: compute the bit count of each 32-bit element with the compiler's scalar popcount builtin.
|
||||||
|
__m256i pcnt;
|
||||||
|
int* pcnt_ptr = reinterpret_cast<int*>(&pcnt);
|
||||||
|
int* aux_ptr = reinterpret_cast<int*>(&aux); // 直接获取 aux 的地址,避免不必要的复制
|
||||||
|
|
||||||
|
#pragma unroll 8 // 提示编译器展开循环,提高 SIMD 计算吞吐量
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
pcnt_ptr[i] = __builtin_popcount(aux_ptr[i]); // 使用编译器内置 popcount
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
sbits_t sbits;
|
sbits_t sbits;
|
||||||
sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
|
sbits.vec = _mm256_cvtepi32_epi8(_mm256_or_si256(aux, _mm256_slli_epi32(_mm256_and_si256(pcnt, mone), 7)));
|
||||||
values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
|
values[0] = _mm256_mask_sub_epi8(values[0], sbits.mask[0], _mm256_setzero_si256(), values[0]);
|
||||||
|
|
Loading…
Add table
Reference in a new issue