// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
// Copyright 2024 Mozilla Foundation.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sgemm.h"
// #include <cosmo.h>
// #include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>     // printf() in the AVX512 branch below
#if defined(__aarch64__)
#include <sys/auxv.h>  // getauxval() / AT_HWCAP for the feature checks below
#include <asm/hwcap.h> // HWCAP_FPHP / HWCAP_ASIMDHP / HWCAP_ASIMDDP
#endif
// #include "llamafile.h"

static const struct GemmFuncs {
    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int,
                  int, int, int, int, int);
    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*,
                   const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
    // Default to the "unsupported" stub (as in the upstream declaration below) so the
    // pointer is never left uninitialized on paths that don't assign a faster kernel.
    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long,
                       const void*, int, int) = iqk_mul_mat_moe_unsupported;
    // typeof(llamafile_sgemm)* sgemm;
    // typeof(llamafile_mixmul)* mixmul;
    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;

    GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
        // The upstream llamafile dispatcher selected kernels at runtime via cpuid:
        //
        // if (X86_HAVE(AVX)) {
        //     if (X86_HAVE(FMA)) {
        //         if (X86_HAVE(AVX2)) {
        //             if (X86_HAVE(AVX512F)) {
        //                 if (X86_HAVE(AVX512VL) &&  //
        //                     X86_HAVE(AVX512BW) &&  //
        //                     X86_HAVE(AVX512DQ) &&  //
        //                     X86_HAVE(AVX512_VNNI) &&  //
        //                     X86_HAVE(AVX512_BF16)) {
        //                     // AMD Zen4+ (2023-)
        //                     sgemm = llamafile_sgemm_amd_zen4;
        //                     mixmul = llamafile_mixmul_amd_zen4;
        //                     iqk_mixmul = iqk_mul_mat_moe_zen4;
        //                 } else {
        //                     // Intel Xeon Skylake+ (2015-)
        //                     sgemm = llamafile_sgemm_amd_avx512f;
        //                     mixmul = llamafile_mixmul_amd_avx512f;
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //                 }
        //             } else if (X86_HAVE(AVXVNNI)) {
        //                 // Intel Alderlake (2021-)
        //                 sgemm = llamafile_sgemm_amd_avxvnni;
        //                 mixmul = llamafile_mixmul_amd_avxvnni;
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //             } else {
        //                 // Intel Haswell/Broadwell/Skylake (2013-2020)
        //                 // AMD Excavator (2015-2022)
        //                 sgemm = llamafile_sgemm_amd_avx2;
        //                 mixmul = llamafile_mixmul_amd_avx2;
        //                 if (X86_HAVE(F16C))
        //                     iqk_mixmul = iqk_mul_mat_moe;
        //             }
        //         } else {
        //             // AMD Piledriver (2011-2014)
        //             sgemm = llamafile_sgemm_amd_fma;
        //             mixmul = llamafile_mixmul_amd_fma;
        //             if (X86_HAVE(F16C))
        //                 iqk_mixmul = iqk_mul_mat_moe;
        //         }
        //     } else {
        //         // Intel Sandybridge/Ivybridge (2010-2012)
        //         // AMD Bulldozer (2011)
        //         sgemm = llamafile_sgemm_amd_avx;
        //         mixmul = llamafile_mixmul_amd_avx;
        //     }
        // } else {
        //     // AMD K8/Barcelona (2003-2010)
        //     // Intel Core/Nehalem (2006-2009)
        //     sgemm = llamafile_sgemm_unsupported;
        //     mixmul = llamafile_mixmul_unsupported;
        // }
        //
        // This adaptation resolves the same decision tree at compile time instead:
#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
        printf("__AVX512F__\n");
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && \
    defined(__AVX512VNNI__) && defined(__AVX512BF16__)
        // AMD Zen4+ (2023-)
        sgemm = llamafile_sgemm_amd_zen4;
        mixmul = llamafile_mixmul_amd_zen4;
        iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
        // Intel Xeon Skylake+ (2015-)
        sgemm = llamafile_sgemm_amd_avx512f;
        mixmul = llamafile_mixmul_amd_avx512f;
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
        // Intel Alderlake (2021-)
        sgemm = llamafile_sgemm_amd_avxvnni;
        mixmul = llamafile_mixmul_amd_avxvnni;
        iqk_mixmul = iqk_mul_mat_moe;
#else
        // Intel Haswell/Broadwell/Skylake (2013-2020)
        // AMD Excavator (2015-2022)
        sgemm = llamafile_sgemm_amd_avx2;
        mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // AMD Piledriver (2011-2014)
        sgemm = llamafile_sgemm_amd_fma;
        mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // Intel Sandybridge/Ivybridge (2010-2012)
        // AMD Bulldozer (2011)
        sgemm = llamafile_sgemm_amd_avx;
        mixmul = llamafile_mixmul_amd_avx;
#endif
#else
        // AMD K8/Barcelona (2003-2010)
        // Intel Core/Nehalem (2006-2009)
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
#elif defined(__aarch64__)
        long hwcap = getauxval(AT_HWCAP);
        if ((hwcap & HWCAP_FPHP) &&     // fp16 scalar isa (ID_AA64PFR0_EL1.FP == 1)
            (hwcap & HWCAP_ASIMDHP) &&  // fp16 vector isa (ID_AA64PFR0_EL1.AdvSIMD == 1)
            (hwcap & HWCAP_ASIMDDP)) {  // dotprod isa (ID_AA64ISAR0_EL1.DP == 1)
            // e.g. Apple M1, Raspberry Pi 5
            sgemm = llamafile_sgemm_arm82;
            mixmul = llamafile_mixmul_arm82;
            iqk_mixmul = iqk_mul_mat_moe_arm82;
        } else {
            // ARM64 baseline ISA
            sgemm = llamafile_sgemm_arm80;
            mixmul = llamafile_mixmul_arm80;
        }
#else
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
    }
} funcs;
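// The `funcs` instance above is initialized at static-initialization time, so
// every later call through llamafile_sgemm / llamafile_mixmul is one indirect
// call with no per-call feature branching. Below is a minimal, self-contained
// sketch of the same pattern, kept in `#if 0` because it is illustrative only:
// the names vadd_avx2 / vadd_scalar are hypothetical and not part of this file.
#if 0
#include <cstddef>

static void vadd_scalar(const float* a, const float* b, float* c, size_t n) {
    for (size_t i = 0; i < n; ++i)  // portable baseline, akin to *_unsupported
        c[i] = a[i] + b[i];
}

#if defined(__AVX2__)
#include <immintrin.h>
static void vadd_avx2(const float* a, const float* b, float* c, size_t n) {
    size_t i = 0;
    for (; i + 8 <= n; i += 8)      // 8 floats per 256-bit vector
        _mm256_storeu_ps(c + i, _mm256_add_ps(_mm256_loadu_ps(a + i),
                                              _mm256_loadu_ps(b + i)));
    for (; i < n; ++i)              // scalar tail
        c[i] = a[i] + b[i];
}
#endif

static const struct VaddFuncs {
    void (*vadd)(const float*, const float*, float*, size_t);
    VaddFuncs() {
#if defined(__AVX2__)
        vadd = vadd_avx2;    // resolved at compile time, like GemmFuncs above
#else
        vadd = vadd_scalar;  // baseline fallback
#endif
    }
} vadd_funcs;
#endif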
/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is available for the given
 * type combination; otherwise the caller should fall back to a general
 * matmul routine.
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is the output matrix
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb,
                     void* C, long ldc, int ith, int nth, int task, int Atype, int Btype,
                     int Ctype, int precision) {
    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype,
                       precision);
}
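// Single-threaded F32 usage sketch for the entry point above, kept in `#if 0`
// as an illustration only. It assumes ggml.h is on the include path for the
// GGML_TYPE_F32 / GGML_TASK_TYPE_COMPUTE / GGML_PREC_DEFAULT constants, and
// that an F32 kernel was built for this target. The effective computation is
// C[j*ldc + i] = sum over l of A[i*lda + l] * B[j*ldb + l].
#if 0
#include <vector>
#include "ggml.h"

static void example_sgemm_f32() {
    const long m = 4, n = 3, k = 8;
    std::vector<float> A(m * k, 1.0f);  // m×k, row i starts at A[i * lda]
    std::vector<float> B(n * k, 2.0f);  // k×n column-major: column j at B[j * ldb]
    std::vector<float> C(m * n);        // m×n column-major: element (i,j) at C[j * ldc + i]

    bool ok = llamafile_sgemm(m, n, k, A.data(), /*lda=*/k, B.data(), /*ldb=*/k,
                              C.data(), /*ldc=*/m, /*ith=*/0, /*nth=*/1,
                              GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, GGML_TYPE_F32,
                              GGML_TYPE_F32, GGML_PREC_DEFAULT);
    if (!ok) {
        // No handwritten kernel serviced this shape/type combination;
        // per the contract above, fall back to a general matmul routine.
    }
}
#endif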
/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights,
                      const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    return funcs.mixmul(params, weights, thought, plan, result);
}

bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A,
                          const void* B, float* C, long nb1, long nb2, const void* vrow_mapping,
                          int ith, int nth) {
    return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth);
}
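// Threading sketch for llamafile_sgemm, kept in `#if 0` as an illustration
// only. Per the documented contract, `ith` must be less than `nth` and every
// worker makes the same call; the kernels partition rows internally by thread
// id. std::thread is used here purely for demonstration (the real callers are
// ggml's compute threads), and the ggml constants are assumed from ggml.h as
// in the sketch above.
#if 0
#include <thread>
#include <vector>
#include "ggml.h"

static void example_sgemm_threaded(long m, long n, long k, const float* A,
                                   const float* B, float* C, int nth) {
    std::vector<std::thread> pool;
    for (int ith = 0; ith < nth; ++ith)
        pool.emplace_back([=] {
            // Same arguments on every thread; only `ith` differs.
            llamafile_sgemm(m, n, k, A, /*lda=*/k, B, /*ldb=*/k, C, /*ldc=*/m,
                            ith, nth, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32,
                            GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        });
    for (auto& t : pool)
        t.join();
}
#endif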