diff --git a/ktransformers/ktransformers_ext/CMakeLists.txt b/ktransformers/ktransformers_ext/CMakeLists.txt
index 02e6a04..591d72f 100644
--- a/ktransformers/ktransformers_ext/CMakeLists.txt
+++ b/ktransformers/ktransformers_ext/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.17)
 project(cpuinfer_ext VERSION 0.1.0)
 set(CMAKE_CXX_STANDARD 17)
@@ -190,7 +190,13 @@ else()
     message(STATUS "Unknown architecture")
 endif()
 
-find_package(CUDA REQUIRED)
+# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
+# find_package(FindCUDAToolkit REQUIRED)
+# if(CUDAToolkit_FOUND)
+#     message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
+# else()
+#     message(STATUS "Can't found CUDA lib")
+# endif()
 
 add_compile_options("$<$:${ARCH_FLAGS}>")
 add_compile_options("$<$:${ARCH_FLAGS}>")
@@ -199,7 +205,7 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
-include_directories("${CUDA_INCLUDE_DIRS}")
+include_directories("D:/CUDA/v12.5/include")
 
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
@@ -210,4 +216,4 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
 pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
 target_link_libraries(${PROJECT_NAME} PRIVATE llama)
-target_link_libraries(${PROJECT_NAME} PRIVATE "/usr/local/cuda/lib64/libcudart.so")
\ No newline at end of file
+target_link_libraries(${PROJECT_NAME} PRIVATE "D:/CUDA/v12.5/lib/x64/cudart.lib")#CUDA::cudart
\ No newline at end of file
diff --git a/ktransformers/ktransformers_ext/cpu_backend/task_queue.h b/ktransformers/ktransformers_ext/cpu_backend/task_queue.h
index b4212fd..cb912be 100644
--- a/ktransformers/ktransformers_ext/cpu_backend/task_queue.h
+++ b/ktransformers/ktransformers_ext/cpu_backend/task_queue.h
@@ -17,6 +17,44 @@
 #include
 #include
 #include
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+class custom_mutex {
+private:
+#ifdef _WIN32
+    HANDLE global_mutex;
+#else
+    std::mutex global_mutex;
+#endif
+
+public:
+    custom_mutex()
+    {
+#ifdef _WIN32
+        global_mutex = CreateMutex(NULL, FALSE, NULL);
+#endif
+    }
+
+    void lock()
+    {
+#ifdef _WIN32
+        WaitForSingleObject(global_mutex, INFINITE);
+#else
+        global_mutex.lock();
+#endif
+    }
+
+    void unlock()
+    {
+#ifdef _WIN32
+        ReleaseMutex(global_mutex);
+#else
+        global_mutex.unlock();
+#endif
+    }
+};
 
 class TaskQueue {
 public:
@@ -32,7 +70,7 @@ class TaskQueue {
     std::queue<std::function<void()>> tasks;
     std::thread worker;
-    std::mutex mutex;
+    custom_mutex mutex;
     std::atomic<bool> sync_flag;
     std::atomic<bool> exit_flag;
 };
diff --git a/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu b/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
index e8e5153..9205d3b 100644
--- a/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
+++ b/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
@@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }
 
+
+
   // Define kernel configurations
-  if (false) {
+#define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
              str(prob_n) + ", " + str(prob_k) + "]" + \
              ", has_act_order = " + str(has_act_order) + \
              ", num_groups = " + str(num_groups) + \
              ", group_size = " + str(group_size) + \
              ",
thread_m_blocks = " + str(thread_m_blocks) + \ + ", thread_n_blocks = " + str(thread_n_blocks) + \ + ", thread_k_blocks = " + str(thread_k_blocks)); + + + if (num_bits == 4 && num_threads == 256) + { + if (false) { + } + CALL_IF(4, 32, 2, 256) + CALL_IF(4, 16, 4, 256) + CALL_IF(4, 8, 8, 256) + else { + undefined_error + } + } + else if (num_bits == 4 && num_threads == 128) + { + if (false) { + } + CALL_IF(4, 8, 4, 128) + CALL_IF(4, 4, 8, 128) + else { + undefined_error + } + } + else if (num_bits == 8 && num_threads == 256) + { + if (false) { + } + CALL_IF(8, 32, 2, 256) + CALL_IF(8, 16, 4, 256) + CALL_IF(8, 8, 8, 256) + else { + undefined_error + } + } + else if (num_bits == 8 && num_threads == 128) + { + if (false) { + } + CALL_IF(8, 8, 4, 128) + CALL_IF(8, 4, 8, 128) + else { + undefined_error + } } - CALL_IF(4, 32, 2, 256) - CALL_IF(4, 16, 4, 256) - CALL_IF(4, 8, 8, 256) - CALL_IF(4, 8, 4, 128) - CALL_IF(4, 4, 8, 128) - CALL_IF(8, 32, 2, 256) - CALL_IF(8, 16, 4, 256) - CALL_IF(8, 8, 8, 256) - CALL_IF(8, 8, 4, 128) - CALL_IF(8, 4, 8, 128) else { - TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + - str(prob_n) + ", " + str(prob_k) + "]" + - ", has_act_order = " + str(has_act_order) + - ", num_groups = " + str(num_groups) + - ", group_size = " + str(group_size) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); + undefined_error } A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; diff --git a/ktransformers/ktransformers_ext/cuda/setup.py b/ktransformers/ktransformers_ext/cuda/setup.py index baf0808..7ccf9ee 100644 --- a/ktransformers/ktransformers_ext/cuda/setup.py +++ b/ktransformers/ktransformers_ext/cuda/setup.py @@ -10,7 +10,7 @@ setup(name='KTransformersOps', 'custom_gguf/dequant.cu', 'binding.cpp', 'gptq_marlin/gptq_marlin.cu', - # 'gptq_marlin_repack.cu', + # 'gptq_marlin_repack.cu', ]) ], cmdclass={'build_ext': BuildExtension diff --git a/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp b/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp index 632c210..979af5e 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp +++ b/ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp @@ -68,10 +68,10 @@ void MLP::forward(const void* input, void* output, Backend* backend) { int nth = config_.intermediate_size / config_.stride; backend->do_work_stealing_job(nth, [&](int task_id) { int ith = task_id; - void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); + void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); float* gate_output_ptr = gate_output_.data() + ith * config_.stride; llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); - void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); + void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * 
ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); float* up_output_ptr = up_output_.data() + ith * config_.stride; llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) { @@ -79,7 +79,7 @@ void MLP::forward(const void* input, void* output, Backend* backend) { } if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) { float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride; - void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); + void* down_input_ptr = (uint8_t*)down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type); } }); @@ -89,11 +89,11 @@ void MLP::forward(const void* input, void* output, Backend* backend) { nth = config_.hidden_size / config_.stride; backend->do_work_stealing_job(nth, [&](int task_id) { int ith = task_id; - void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); + void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); float* down_output_ptr = down_output_.data() + ith * config_.stride; llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) { - void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type); } }); diff --git a/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp b/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp index aaea4a7..3a5c852 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp +++ b/ktransformers/ktransformers_ext/operators/llamafile/moe.cpp @@ -9,9 +9,9 @@ **/ #include "moe.h" #include -#include "unistd.h" +#include -void* MOE::buffer_ = nullptr; +uint8_t* MOE::buffer_ = nullptr; MOE::MOE(MOEConfig config) { config_ = config; @@ -32,7 +32,7 @@ MOE::MOE(MOEConfig config) { buffer_size += config_.routed_expert_num * config_.group_max_len * 
config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type); buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size; buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size; - buffer_ = malloc(buffer_size); + buffer_ = (uint8_t*)malloc(buffer_size); } uint64_t offset = 0; @@ -95,7 +95,7 @@ MOE::MOE(MOEConfig config) { m_local_pos_.resize(config_.group_max_len); for (int i = 0; i < config_.group_max_len; i++) { - m_local_pos_[i].reserve(config_.expert_num); + m_local_pos_[i].resize(config_.routed_expert_num); } m_local_num_.resize(config_.expert_num); m_local_gate_input_ptr_.resize(config_.expert_num); @@ -156,10 +156,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c int expert_idx = task_id / nth; uint64_t expert_id = expert_ids[expert_idx]; int ith = task_id % nth; - void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); + void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride; llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); - void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); + void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride; llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) { @@ -184,7 +184,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c } for (int expert_idx = 0; expert_idx < k; expert_idx++) { uint64_t expert_id = expert_ids[expert_idx]; - void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); + void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride; llamafile_sgemm(config_.stride, 1, 
config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) { @@ -193,7 +193,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c } if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) { float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride; - void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type); } }); @@ -226,9 +226,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* const void* gate_input_ptr; const void* up_input_ptr; if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { - gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); } else { - to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type); + to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type); if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); gate_input_ptr = up_input_ptr = m_gate_input_[i]; @@ -237,13 +237,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); gate_input_ptr = m_gate_input_[i]; } else { - gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); } if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type); up_input_ptr = m_up_input_[i]; } else { - up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); + up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); } } } @@ -258,11 +258,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, 
const float* int expert_idx = task_id / nth; int ith = task_id % nth; void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx]; - void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); + void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride; llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); void* up_input_ptr = m_local_up_input_ptr_[expert_idx]; - void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); + void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride; llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); for (int i = 0; i < m_local_num_[expert_idx]; i++) { @@ -280,7 +280,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* int expert_idx = task_id / nth; int ith = task_id % nth; void* down_input_ptr = m_local_down_input_ptr_[expert_idx]; - void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); + void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride; llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); }); @@ -293,18 +293,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j]; } } - from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), 
config_.hidden_size, config_.hidden_type); + from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type); }); } void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) { if (qlen < config_.group_min_len) { for (int i = 0; i < qlen; i++) { - forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); + forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); } return; } int forward_len = std::min(config_.group_max_len, qlen); forward_many(forward_len, k, expert_ids, weights, input, output, backend); - forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); + forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); } \ No newline at end of file diff --git a/ktransformers/ktransformers_ext/operators/llamafile/moe.h b/ktransformers/ktransformers_ext/operators/llamafile/moe.h index 0d279fe..588a829 100644 --- a/ktransformers/ktransformers_ext/operators/llamafile/moe.h +++ b/ktransformers/ktransformers_ext/operators/llamafile/moe.h @@ -54,7 +54,7 @@ class MOE { void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend); private: - static void* buffer_; + static uint8_t* buffer_; MOEConfig config_; void* gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)] void* up_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)] diff --git a/ktransformers/local_chat.py b/ktransformers/local_chat.py index 59839be..2b2c8a7 100644 --- a/ktransformers/local_chat.py +++ b/ktransformers/local_chat.py @@ -112,4 +112,4 @@ def local_chat( generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens) if __name__ == "__main__": - fire.Fire(local_chat) + fire.Fire(local_chat) \ No newline at end of file diff --git a/ktransformers/models/custom_cache.py b/ktransformers/models/custom_cache.py index 385e6ec..93fc4c7 100644 --- a/ktransformers/models/custom_cache.py +++ b/ktransformers/models/custom_cache.py @@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache): self.value_cache: List[torch.Tensor] = [] cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) if config.architectures[0] == "DeepseekV2ForCausalLM": + # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it 
automatically # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim) # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim) key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim) diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py index 6adb657..bc44342 100644 --- a/ktransformers/operators/experts.py +++ b/ktransformers/operators/experts.py @@ -19,7 +19,8 @@ import torch import sys, os from ktransformers.operators.base_operator import BaseInjectedModule -sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build") +#sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build/") +sys.path.append(os.path.dirname(__file__) + "\\..\\ktransformers_ext\\build\\Release") import cpuinfer_ext from cpuinfer_ext.moe import MOEConfig, MOE import ctypes @@ -179,6 +180,7 @@ class MLPCPUExperts(MLPExpertsBase): def forward(self, input_tensor, expert_ids, weights): # generate, capture and run cuda graph if input_tensor.size(0)==1: + # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible #print("capturing experts") MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True) MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True) @@ -359,7 +361,12 @@ class MLPExpertsTorch(MLPExpertsBase): self.down = None def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor: - + # TODO: forward should transfer data to gpu, and make the data transfering capturable using pin memory, + # just like CPUInfer MLPCPUExperts. 
There may be a base class of experts on cpu + hidden_states_cpu = hidden_states_cpu.to("cpu") + selected_experts_cpu = selected_experts_cpu.to("cpu") + routing_weights_cpu = routing_weights_cpu.to("cpu") + batch_sequence_length, hidden_dim = hidden_states_cpu.size() final_hidden_states = torch.zeros( @@ -587,7 +594,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE): topk_idx, topk_weight, aux_loss = self.gate(hidden_states) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - if sequence_length == 1: + if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"): self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0]) if self.config.n_shared_experts is not None: y_ = self.shared_experts(identity).squeeze(0) diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml index 025bd2b..18efd60 100644 --- a/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml +++ b/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml @@ -26,7 +26,7 @@ prefill_device: "cuda" prefill_mlp_type: "MLPExpertsTorch" generate_device: "cpu" - generate_mlp_type: "MLPCPUExperts" + generate_mlp_type: "MLPCPUExperts" out_device: "cuda" recursive: False # don't recursively inject submodules of this module - match: diff --git a/ktransformers/util/custom_gguf.py b/ktransformers/util/custom_gguf.py index 643713e..8b00dca 100644 --- a/ktransformers/util/custom_gguf.py +++ b/ktransformers/util/custom_gguf.py @@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54 Version : 1.0.0 LastEditors : Azure LastEditTime : 2024-07-26 09:28:25 +Adapted from https://github.com/99991/pygguf/blob/main/gguf.py +Copyright (c) 2023-2024 The ggml authors +Copyright (c) 2024 Thomas Germer Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
''' # copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf @@ -95,7 +98,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization GGML_TYPES = { "F32": 0, - "F16": 1, + "Q4_0": 2, + "Q5_0": 6, "Q8_0": 8, "Q2_K": 10, "Q3_K": 11, @@ -108,7 +112,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()} GGML_BLOCK_SIZES = { "F32": 4, - "F16": 2, + "Q4_0": 2 + 16, + "Q5_0": 2 + 4 + 16, "Q8_0": 2 + 32, "Q2_K": 256 // 16 + 256 // 4 + 2 + 2, "Q3_K": 256 // 8 + 256 // 4 + 12 + 2, @@ -119,7 +124,8 @@ GGML_BLOCK_SIZES = { GGML_ELEMENTS_PER_BLOCK = { "F32": 1, - "F16": 1, + "Q4_0": 32, + "Q5_0": 32, "Q8_0": 32, "Q2_K": 256, "Q3_K": 256, @@ -128,14 +134,6 @@ GGML_ELEMENTS_PER_BLOCK = { "Q6_K": 256, } -# DATA_TYPES = { -# "uint32": 4, -# "int32": 5, -# "float32": 6, -# "string": 8, -# "array": 9, -# "uint64": 10, -# } DATA_TYPES = { "uint8": 0, "int8": 1, @@ -272,7 +270,7 @@ class GGUFLoader: def load_gguf_tensor(self, name: str, device:str = "cpu")->torch.Tensor: t = self.tensor_info[name] - + shape = t["shape"] ggml_type = t["ggml_type"] @@ -282,10 +280,12 @@ class GGUFLoader: ggml_name = GGML_NAMES[ggml_type] data = self.get_mmap_tensor(name) - if "cuda" in device.lower(): values = GGML_DEQUANTIZE_GPU[ggml_name](data, device) + #values = GGML_DEQUANTIZE[ggml_name](data) + #print("load_gguf_tensor") + #values = torch.from_numpy(values).to(device = device) else: values = GGML_DEQUANTIZE[ggml_name](data) values = torch.from_numpy(values) @@ -375,7 +375,7 @@ def dequantize_q2_k(data): return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4) def dequantize_q2_k_gpu(data): - pass + raise NotImplementedError() def dequantize_q3_k(data): # C implementation @@ -420,7 +420,7 @@ def dequantize_q3_k(data): ], axis=1) def dequantize_q3_k_gpu(data): - pass + raise NotImplementedError() def dequantize_q4_k(data): # C implementation @@ -429,20 +429,16 @@ def dequantize_q4_k(data): # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116 block_size = GGML_BLOCK_SIZES["Q4_K"] num_blocks = len(data) // block_size - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) - # Casting to float32 because float16 is very slow on CPU scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32) scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32) qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1) qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32) - # Dequantize scales and offsets (6 bits and 4 + 2 bits) factors = scale_factors * np.concatenate([qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1) offsets = scale_offsets * np.concatenate([qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1) - # Interleave low and high quantized bits qs2 = np.stack([qs2 & 0xf, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32) # Dequantize final weights using scales and offsets @@ -513,7 +509,7 @@ def dequantize_q5_k(data): ], axis=1) def dequantize_q5_k_gpu(data): - pass + raise NotImplementedError() def dequantize_q6_k(data): @@ -573,6 +569,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"): data = torch.from_numpy(data) return KTransformersOps.dequantize_q6_k(data, 210, device) +def dequantize_q4_0(data): + # C implementation + # 
https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515 + # C struct definition + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141 + num_blocks = len(data) // GGML_BLOCK_SIZES["Q4_0"] + + scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 8)[:, :1].astype(np.float32) + qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 16)[:, 2:] + + return np.concatenate([ + scales * ((qs & 0xf).astype(np.int8) - 8), + scales * ((qs >> 4).astype(np.int8) - 8), + ], axis=1) + +def dequantize_q4_0_gpu(data): + raise NotImplementedError() + +def dequantize_q5_0(data): + # C implementation + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556 + # C struct definition + # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161 + num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_0"] + + scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 2 + 8)[:, :1].astype(np.float32) + qh = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2:2 + 4] + qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2 + 4:] + + bits = np.unpackbits(qh, axis=-1, bitorder="little") + + x0 = ((qs & 0xf).astype(np.int8) | (bits[:, :16] << 4)) - 16 + x1 = ((qs >> 4).astype(np.int8) | (bits[:, 16:] << 4)) - 16 + + return np.concatenate([ + scales * x0, + scales * x1, + ], axis=1) + +def dequantize_q5_0_gpu(data): + raise NotImplementedError() + def dequantize_q8_0(data): # C struct definition # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43 @@ -615,6 +653,8 @@ def dequantize_f16_gpu(data, device): GGML_DEQUANTIZE = { "F32": dequantize_f32, "F16": dequantize_f16, + "Q4_0": dequantize_q4_0, + "Q5_0": dequantize_q5_0, "Q8_0": dequantize_q8_0, "Q2_K": dequantize_q2_k, "Q3_K": dequantize_q3_k, @@ -626,6 +666,8 @@ GGML_DEQUANTIZE = { GGML_DEQUANTIZE_GPU = { "F32": dequantize_f32_gpu, "F16": dequantize_f16_gpu, + "Q4_0": dequantize_q4_0_gpu, + "Q5_0": dequantize_q5_0_gpu, "Q8_0": dequantize_q8_0_gpu, "Q2_K": dequantize_q2_k_gpu, "Q3_K": dequantize_q3_k_gpu, diff --git a/ktransformers/util/utils.py b/ktransformers/util/utils.py index 7976e56..b5ac573 100644 --- a/ktransformers/util/utils.py +++ b/ktransformers/util/utils.py @@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000): logits = cuda_graph_runner(cur_token, position_ids, cache_position) past_key_values.change_seq_length(1) """ + inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda") + custom_stream = torch.cuda.Stream() with torch.cuda.stream(custom_stream): - logits=model(cur_token, - position_ids=position_ids, - cache_position=cache_position, - past_key_values=past_key_values, - return_dict=False, use_cache=True)[0] - #""" + logits=model(inputs_embeds = inputs_embeds, + position_ids = position_ids, + cache_position = cache_position, + past_key_values = past_key_values, + return_dict = False, use_cache = True) [0] + """ torch.cuda.synchronize() #print(logits) next_token_scores = logits_warper(inputs, logits[:, -1, :]) @@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000): generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int) past_key_values.cur_idx=cache_position start_time = time.time() - #custom_stream = torch.cuda.Stream() 
inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda") logits = model( diff --git a/setup.py b/setup.py index 1b2d3cf..e97c7cc 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,8 @@ class VersionInfo: """ if sys.platform.startswith("linux"): return f'linux_{platform.uname().machine}' + elif sys.platform == "win32": + return "win_amd64" else: raise ValueError("Unsupported platform: {}".format(sys.platform)) @@ -97,6 +99,8 @@ class VersionInfo: return 'avx2' raise ValueError( "Unsupported cpu Instructions: {}".format(flags_line)) + elif sys.platform == "win32": + return 'native' else: raise ValueError("Unsupported platform: {}".format(sys.platform)) diff --git a/third_party/llamafile/iqk_mul_mat.inc b/third_party/llamafile/iqk_mul_mat.inc index 150a8f9..5e9d688 100644 --- a/third_party/llamafile/iqk_mul_mat.inc +++ b/third_party/llamafile/iqk_mul_mat.inc @@ -22,7 +22,7 @@ #include #include -#if defined __x86_64__ || defined __aarch64__ +#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64) #include "llama.cpp/ggml-impl.h" #include "llama.cpp/ggml-quants.h" @@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi return true; } -#if defined __x86_64__ +#if defined __x86_64__ || defined(_M_X64) #if defined HAVE_FANCY_SIMD #undef HAVE_FANCY_SIMD @@ -1412,7 +1412,8 @@ template void MulMat::set_functions(MulMat& m) { bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) { - row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); + if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0) + row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00); switch (typeA) { case GGML_TYPE_Q2_K: diff --git a/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp b/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp index 9e3de18..bfd12da 100644 --- a/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp +++ b/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp @@ -3,6 +3,6 @@ // Copyrigth 2024 Iwan Kawrakow. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #include "iqk_mul_mat.inc" #endif // __x86_64__ diff --git a/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp b/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp index 4d0a979..f0f439f 100644 --- a/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp +++ b/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Iwan Kawrakow. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
-#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define iqk_mul_mat iqk_mul_mat_zen4 #define iqk_mul_mat_moe iqk_mul_mat_moe_zen4 #include "iqk_mul_mat.inc" diff --git a/third_party/llamafile/sgemm.cpp b/third_party/llamafile/sgemm.cpp index 7ec34ff..6a7cab4 100644 --- a/third_party/llamafile/sgemm.cpp +++ b/third_party/llamafile/sgemm.cpp @@ -22,19 +22,22 @@ #include "sgemm.h" // #include -#include +// #include // #include #include -#include +// #include #include // #include "llamafile.h" static const struct GemmFuncs { - typeof(llamafile_sgemm)* sgemm; - typeof(llamafile_mixmul)* mixmul; - typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; + bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int); + bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*); + bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int); + // typeof(llamafile_sgemm)* sgemm; + // typeof(llamafile_mixmul)* mixmul; + // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported; GemmFuncs() { -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) // if (X86_HAVE(AVX)) { // if (X86_HAVE(FMA)) { // if (X86_HAVE(AVX2)) { @@ -86,10 +89,12 @@ static const struct GemmFuncs { // sgemm = llamafile_sgemm_unsupported; // mixmul = llamafile_mixmul_unsupported; // } + #if defined(__AVX__) -#if defined(__FMA__) +#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) #if defined(__AVX2__) #if defined(__AVX512F__) + printf("__AVX512F__\n"); #if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__) // AMD Zen4+ (2023-) sgemm = llamafile_sgemm_amd_zen4; diff --git a/third_party/llamafile/tinyblas_cpu.h b/third_party/llamafile/tinyblas_cpu.h index f361c0c..962c47c 100644 --- a/third_party/llamafile/tinyblas_cpu.h +++ b/third_party/llamafile/tinyblas_cpu.h @@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e) } #endif -#if defined(__FMA__) +#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) template <> inline __m256 madd(__m256 a, __m256 b, __m256 c) { diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp index 255f873..5cbf5df 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define llamafile_mixmul llamafile_mixmul_amd_avx #include "tinyblas_cpu_mixmul.inc" diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp index 552d1aa..95d44bf 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
-#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define llamafile_mixmul llamafile_mixmul_amd_avx2 #include "tinyblas_cpu_mixmul.inc" #endif // __x86_64__ diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp index b5e5183..82ab637 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define llamafile_mixmul llamafile_mixmul_amd_avx512f #include "tinyblas_cpu_mixmul.inc" #endif // __x86_64__ diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp index c2b2790..2726ac8 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define llamafile_mixmul llamafile_mixmul_amd_avxvnni #include "tinyblas_cpu_mixmul.inc" #endif // __x86_64__ diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp index 6fd25c9..4d4c4d8 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define llamafile_mixmul llamafile_mixmul_amd_fma #include "tinyblas_cpu_mixmul.inc" #endif // __x86_64__ diff --git a/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp b/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp index aaac6e1..3d478c1 100644 --- a/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp +++ b/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define llamafile_mixmul llamafile_mixmul_amd_zen4 #include "tinyblas_cpu_mixmul.inc" #endif // __x86_64__ diff --git a/third_party/llamafile/tinyblas_cpu_sgemm.inc b/third_party/llamafile/tinyblas_cpu_sgemm.inc index c9d1f47..634dc3e 100644 --- a/third_party/llamafile/tinyblas_cpu_sgemm.inc +++ b/third_party/llamafile/tinyblas_cpu_sgemm.inc @@ -321,8 +321,8 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void assert(ith < nth); #if QK_K == 256 -#if defined(__x86_64__) -#if defined(__AVX2__) && defined(__FMA__) +#if defined(__x86_64__) || defined(_M_X64) +#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))) // if (X86_CHECK(AVX2) && X86_CHECK(FMA)) { if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) { if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) { diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp index e57eda6..439e55d 100644 --- a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp +++ b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp @@ -3,7 +3,7 @@ // Copyrigth 2024 Mozilla Foundation. 
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avx
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__
diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
index 0e1fe84..4b46f01 100644
--- a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
+++ b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avx2
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__
diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
index cafcaa2..16425e4 100644
--- a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
+++ b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avx512f
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__
diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
index 5d2ddce..a4ac488 100644
--- a/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
+++ b/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avxvnni
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__
diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp b/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
index 275c9b4..e1559da 100644
--- a/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
+++ b/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_fma
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__
diff --git a/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp b/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
index 01924a7..f524ba1 100644
--- a/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
+++ b/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_zen4
 #define iqk_mul_mat iqk_mul_mat_zen4
 #include "tinyblas_cpu_sgemm.inc"
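
For reference, the Win32-specific branch of the custom_mutex shim added to task_queue.h is not strictly required: MSVC's standard library ships std::mutex, so the same lock can be expressed portably. The sketch below is illustrative only and not part of the patch; the names portable_mutex and queue_mutex are assumptions introduced here.

// Sketch: a portable TaskQueue lock built on the standard library.
// std::mutex works with MSVC on Windows, so no Win32 HANDLE is needed.
#include <mutex>

class portable_mutex {            // illustrative name, not used by the patch
public:
    void lock()   { m_.lock(); }
    void unlock() { m_.unlock(); }
private:
    std::mutex m_;                // same type on Windows, Linux and macOS
};

// Usage: prefer an RAII guard over manual lock()/unlock() pairs, e.g.
//     std::lock_guard<portable_mutex> guard(queue_mutex);
// so the mutex is released even if the critical section throws.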
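
Many of the mlp.cpp and moe.cpp hunks above repeat one fix: arithmetic on void* is a GCC extension that MSVC rejects, so each byte offset is now applied through a uint8_t* cast. A small helper could keep that pattern in one place; this is a sketch under that assumption, and offset_bytes does not exist in the codebase.

#include <cstddef>
#include <cstdint>

// Sketch: advance an opaque buffer by a byte offset in a way MSVC accepts.
// Casting to uint8_t* makes the byte-sized stride explicit and portable.
static inline void* offset_bytes(void* base, std::size_t nbytes) {
    return static_cast<std::uint8_t*>(base) + nbytes;
}

static inline const void* offset_bytes(const void* base, std::size_t nbytes) {
    return static_cast<const std::uint8_t*>(base) + nbytes;
}

// Hypothetical use, mirroring the gate_proj_ptr computation in the diff:
//     void* gate_proj_ptr = offset_bytes(gate_proj_,
//         ith * stride * hidden_size * ggml_type_size(t) / ggml_blck_size(t));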