Mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2025-09-10 15:29:39 +00:00
Support Windows; support Q4_0 and Q5_0 dequantization on CPU. Add the copyright notice from pygguf (it was added before, but disappeared after a merge). Add some TODOs in the code.
This commit is contained in: parent 442e13bc97, commit 0a2fd52cea
32 changed files with 248 additions and 108 deletions
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.17)
 project(cpuinfer_ext VERSION 0.1.0)
 
 set(CMAKE_CXX_STANDARD 17)
@@ -190,7 +190,13 @@ else()
     message(STATUS "Unknown architecture")
 endif()
 
-find_package(CUDA REQUIRED)
+# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
+# find_package(FindCUDAToolkit REQUIRED)
+# if(CUDAToolkit_FOUND)
+# message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
+# else()
+# message(STATUS "Can't found CUDA lib")
+# endif()
 
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
@@ -199,7 +205,7 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
-include_directories("${CUDA_INCLUDE_DIRS}")
+include_directories("D:/CUDA/v12.5/include")
 
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
@@ -210,4 +216,4 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
 
 pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
 target_link_libraries(${PROJECT_NAME} PRIVATE llama)
-target_link_libraries(${PROJECT_NAME} PRIVATE "/usr/local/cuda/lib64/libcudart.so")
+target_link_libraries(${PROJECT_NAME} PRIVATE "D:/CUDA/v12.5/lib/x64/cudart.lib")#CUDA::cudart
@@ -17,6 +17,44 @@
 #include <queue>
 #include <thread>
 #include <vector>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+class custom_mutex {
+   private:
+#ifdef _WIN32
+    HANDLE global_mutex;
+#elif
+    std::mutex global_mutex;
+#endif
+
+   public:
+    custom_mutex()
+    {
+#ifdef _WIN32
+        HANDLE global_mutex;
+#endif
+    }
+
+    void lock()
+    {
+#ifdef _WIN32
+        WaitForSingleObject(global_mutex, INFINITE);
+#elif
+        global_mutex.lock();
+#endif
+    }
+
+    void unlock()
+    {
+#ifdef _WIN32
+        ReleaseMutex(global_mutex);
+#elif
+        global_mutex.lock();
+#endif
+    }
+};
+
 class TaskQueue {
    public:
@@ -32,7 +70,7 @@ class TaskQueue {
 
     std::queue<std::function<void()>> tasks;
     std::thread worker;
-    std::mutex mutex;
+    custom_mutex mutex;
     std::atomic<bool> sync_flag;
     std::atomic<bool> exit_flag;
 };
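Note on the custom_mutex wrapper added above: as committed, the Windows handle is never created (the constructor only declares a local HANDLE), the #elif directives carry no condition, and unlock() calls lock() on the non-Windows path. A minimal sketch of what the wrapper presumably intends, assuming CreateMutex/ReleaseMutex on Windows and std::mutex elsewhere; the names and details below are illustrative, not the committed code:

    // Hypothetical corrected sketch (not the committed code).
    #ifdef _WIN32
    #include <windows.h>
    #else
    #include <mutex>
    #endif

    class portable_mutex {
       private:
    #ifdef _WIN32
        HANDLE handle_;            // created once in the constructor
    #else
        std::mutex handle_;
    #endif
       public:
        portable_mutex() {
    #ifdef _WIN32
            handle_ = CreateMutexA(nullptr, FALSE, nullptr);  // unnamed, not initially owned
    #endif
        }
        void lock() {
    #ifdef _WIN32
            WaitForSingleObject(handle_, INFINITE);
    #else
            handle_.lock();
    #endif
        }
        void unlock() {
    #ifdef _WIN32
            ReleaseMutex(handle_);
    #else
            handle_.unlock();      // the committed version calls lock() here
    #endif
        }
    };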
@@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }
 
+
+
   // Define kernel configurations
+  #define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
+                              str(prob_n) + ", " + str(prob_k) + "]" + \
+                              ", has_act_order = " + str(has_act_order) + \
+                              ", num_groups = " + str(num_groups) + \
+                              ", group_size = " + str(group_size) + \
+                              ", thread_m_blocks = " + str(thread_m_blocks) + \
+                              ", thread_n_blocks = " + str(thread_n_blocks) + \
+                              ", thread_k_blocks = " + str(thread_k_blocks));
+
+
+  if (num_bits == 4 && num_threads == 256)
+  {
   if (false) {
   }
   CALL_IF(4, 32, 2, 256)
   CALL_IF(4, 16, 4, 256)
   CALL_IF(4, 8, 8, 256)
+  else {
+    undefined_error
+  }
+  }
+  else if (num_bits == 4 && num_threads == 128)
+  {
+  if (false) {
+  }
   CALL_IF(4, 8, 4, 128)
   CALL_IF(4, 4, 8, 128)
+  else {
+    undefined_error
+  }
+  }
+  else if (num_bits == 8 && num_threads == 256)
+  {
+  if (false) {
+  }
   CALL_IF(8, 32, 2, 256)
   CALL_IF(8, 16, 4, 256)
   CALL_IF(8, 8, 8, 256)
+  else {
+    undefined_error
+  }
+  }
+  else if (num_bits == 8 && num_threads == 128)
+  {
+  if (false) {
+  }
   CALL_IF(8, 8, 4, 128)
   CALL_IF(8, 4, 8, 128)
   else {
-    TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
-                str(prob_n) + ", " + str(prob_k) + "]" +
-                ", has_act_order = " + str(has_act_order) +
-                ", num_groups = " + str(num_groups) +
-                ", group_size = " + str(group_size) +
-                ", thread_m_blocks = " + str(thread_m_blocks) +
-                ", thread_n_blocks = " + str(thread_n_blocks) +
-                ", thread_k_blocks = " + str(thread_k_blocks));
+    undefined_error
+  }
+  }
+  else {
+    undefined_error
   }
 
   A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
 
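Note on the dispatch above: each CALL_IF(...) presumably expands to one or more `else if (...) { ... }` arms (as in the upstream marlin kernels), so the empty `if (false) {}` only anchors the chain and the trailing `else { undefined_error }` reports an unsupported shape. A schematic C++ sketch of the pattern; EXPANDS_TO and its condition are hypothetical placeholders, not the committed macro:

    // Schematic only: how an if(false)/macro/else chain dispatches to a kernel.
    // EXPANDS_TO stands in for CALL_IF; the real macro checks tile/thread shapes.
    #define EXPANDS_TO(COND)                          \
        else if (COND) {                              \
            /* launch the matching kernel instance */ \
        }

    void dispatch_example(int num_bits, int num_threads) {
        if (false) {
        }
        EXPANDS_TO(num_bits == 4 && num_threads == 256)
        EXPANDS_TO(num_bits == 8 && num_threads == 256)
        else {
            // undefined_error: TORCH_CHECK(false, "Unsupported shapes: ...")
        }
    }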
@@ -68,10 +68,10 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     int nth = config_.intermediate_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
-        void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = gate_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = up_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -79,7 +79,7 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
         }
         if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
             float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride;
-            void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
+            void* down_input_ptr = (uint8_t*)down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
             from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
         }
     });
@@ -89,11 +89,11 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     nth = config_.hidden_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
-        void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = down_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
-            void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type);
         }
     });
 
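Note on the (uint8_t*) casts introduced in this file (and below in moe.cpp): arithmetic on void* is a GCC/Clang extension, and MSVC rejects expressions such as `gate_proj_ + offset`, so casting to a byte pointer first makes the offset arithmetic standard C++ and is part of the Windows port. A minimal illustration with a hypothetical helper:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper: advance an opaque buffer pointer by a byte offset.
    inline void* advance_bytes(void* base, std::size_t offset_bytes) {
        // return base + offset_bytes;            // non-standard: arithmetic on void* (MSVC rejects it)
        return (uint8_t*)base + offset_bytes;     // portable: arithmetic on a pointer to a 1-byte type
    }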
@@ -9,9 +9,9 @@
 **/
 #include "moe.h"
 #include <iostream>
-#include "unistd.h"
+#include <cstdint>
 
-void* MOE::buffer_ = nullptr;
+uint8_t* MOE::buffer_ = nullptr;
 
 MOE::MOE(MOEConfig config) {
     config_ = config;
@@ -32,7 +32,7 @@ MOE::MOE(MOEConfig config) {
         buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
         buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
         buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
-        buffer_ = malloc(buffer_size);
+        buffer_ = (uint8_t*)malloc(buffer_size);
     }
 
     uint64_t offset = 0;
@@ -95,7 +95,7 @@ MOE::MOE(MOEConfig config) {
 
     m_local_pos_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_local_pos_[i].reserve(config_.expert_num);
+        m_local_pos_[i].resize(config_.routed_expert_num);
     }
     m_local_num_.resize(config_.expert_num);
     m_local_gate_input_ptr_.resize(config_.expert_num);
@@ -156,10 +156,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         int expert_idx = task_id / nth;
         uint64_t expert_id = expert_ids[expert_idx];
         int ith = task_id % nth;
-        void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -184,7 +184,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
     }
     for (int expert_idx = 0; expert_idx < k; expert_idx++) {
         uint64_t expert_id = expert_ids[expert_idx];
-        void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -193,7 +193,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
     }
     if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
         float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
-        void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+        void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
         from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
     }
     });
@@ -226,9 +226,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         const void* gate_input_ptr;
         const void* up_input_ptr;
         if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-            gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
         } else {
-            to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
+            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
             if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = up_input_ptr = m_gate_input_[i];
@@ -237,13 +237,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = m_gate_input_[i];
             } else {
-                gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
             if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                 up_input_ptr = m_up_input_[i];
             } else {
-                up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
         }
     }
@@ -258,11 +258,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
-        void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
-        void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = 0; i < m_local_num_[expert_idx]; i++) {
@@ -280,7 +280,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
-        void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
     });
@@ -293,18 +293,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
             }
         }
-        from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
+        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
     });
 }
 
 void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
     if (qlen < config_.group_min_len) {
         for (int i = 0; i < qlen; i++) {
-            forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
         }
         return;
     }
     int forward_len = std::min(config_.group_max_len, qlen);
     forward_many(forward_len, k, expert_ids, weights, input, output, backend);
-    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
 
@@ -54,7 +54,7 @@ class MOE {
     void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
 
    private:
-    static void* buffer_;
+    static uint8_t* buffer_;
     MOEConfig config_;
     void* gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
     void* up_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
 
@@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache):
         self.value_cache: List[torch.Tensor] = []
         cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         if config.architectures[0] == "DeepseekV2ForCausalLM":
+            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
             # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
             # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
             key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
 
@@ -19,7 +19,8 @@ import torch
 import sys, os
 from ktransformers.operators.base_operator import BaseInjectedModule
 
-sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build")
+#sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build/")
+sys.path.append(os.path.dirname(__file__) + "\\..\\ktransformers_ext\\build\\Release")
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
 import ctypes
@@ -179,6 +180,7 @@ class MLPCPUExperts(MLPExpertsBase):
     def forward(self, input_tensor, expert_ids, weights):
         # generate, capture and run cuda graph
         if input_tensor.size(0)==1:
+            # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
             #print("capturing experts")
             MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
             MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
@@ -359,6 +361,11 @@ class MLPExpertsTorch(MLPExpertsBase):
         self.down = None
 
     def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
+        # TODO: forward should transfer data to gpu, and make the data transfering capturable using pin memory,
+        # just like CPUInfer MLPCPUExperts. There may be a base class of experts on cpu
+        hidden_states_cpu = hidden_states_cpu.to("cpu")
+        selected_experts_cpu = selected_experts_cpu.to("cpu")
+        routing_weights_cpu = routing_weights_cpu.to("cpu")
 
         batch_sequence_length, hidden_dim = hidden_states_cpu.size()
 
@@ -587,7 +594,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
         topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
 
-        if sequence_length == 1:
+        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
             self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
             if self.config.n_shared_experts is not None:
                 y_ = self.shared_experts(identity).squeeze(0)
 
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
 Version : 1.0.0
 LastEditors : Azure
 LastEditTime : 2024-07-26 09:28:25
+Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
+Copyright (c) 2023-2024 The ggml authors
+Copyright (c) 2024 Thomas Germer
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 # copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
@@ -95,7 +98,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
 
 GGML_TYPES = {
     "F32": 0,
-    "F16": 1,
+    "Q4_0": 2,
+    "Q5_0": 6,
     "Q8_0": 8,
     "Q2_K": 10,
     "Q3_K": 11,
@@ -108,7 +112,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
 
 GGML_BLOCK_SIZES = {
     "F32": 4,
-    "F16": 2,
+    "Q4_0": 2 + 16,
+    "Q5_0": 2 + 4 + 16,
     "Q8_0": 2 + 32,
     "Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
     "Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
@@ -119,7 +124,8 @@ GGML_BLOCK_SIZES = {
 
 GGML_ELEMENTS_PER_BLOCK = {
     "F32": 1,
-    "F16": 1,
+    "Q4_0": 32,
+    "Q5_0": 32,
     "Q8_0": 32,
     "Q2_K": 256,
     "Q3_K": 256,
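Note on the new table entries above: they describe the raw GGUF block layouts this commit starts handling. A Q4_0 block packs 32 weights into 2 + 16 = 18 bytes (an fp16 scale plus 32 packed 4-bit values), and a Q5_0 block packs 32 weights into 2 + 4 + 16 = 22 bytes (fp16 scale, 32 high bits, 32 low nibbles). A small C++ mirror of that arithmetic, assuming the same field order as ggml-common.h; uint16_t stands in for ggml's half type:

    #include <cstdint>

    struct block_q4_0 {
        uint16_t d;       // fp16 scale ("delta")
        uint8_t  qs[16];  // 32 quants, 4 bits each, two per byte
    };                    // 2 + 16 = 18 bytes per 32 elements

    struct block_q5_0 {
        uint16_t d;       // fp16 scale
        uint8_t  qh[4];   // 32 high bits, one per quant
        uint8_t  qs[16];  // 32 low nibbles
    };                    // 2 + 4 + 16 = 22 bytes per 32 elements

    static_assert(sizeof(block_q4_0) == 18, "matches GGML_BLOCK_SIZES[\"Q4_0\"]");
    static_assert(sizeof(block_q5_0) == 22, "matches GGML_BLOCK_SIZES[\"Q5_0\"]");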
@@ -128,14 +134,6 @@ GGML_ELEMENTS_PER_BLOCK = {
     "Q6_K": 256,
 }
 
-# DATA_TYPES = {
-#     "uint32": 4,
-#     "int32": 5,
-#     "float32": 6,
-#     "string": 8,
-#     "array": 9,
-#     "uint64": 10,
-# }
 DATA_TYPES = {
     "uint8": 0,
     "int8": 1,
@@ -283,9 +281,11 @@ class GGUFLoader:
 
         data = self.get_mmap_tensor(name)
 
-
         if "cuda" in device.lower():
             values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
+            #values = GGML_DEQUANTIZE[ggml_name](data)
+            #print("load_gguf_tensor")
+            #values = torch.from_numpy(values).to(device = device)
         else:
             values = GGML_DEQUANTIZE[ggml_name](data)
             values = torch.from_numpy(values)
@@ -375,7 +375,7 @@ def dequantize_q2_k(data):
     return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
 
 def dequantize_q2_k_gpu(data):
-    pass
+    raise NotImplementedError()
 
 def dequantize_q3_k(data):
     # C implementation
@@ -420,7 +420,7 @@ def dequantize_q3_k(data):
     ], axis=1)
 
 def dequantize_q3_k_gpu(data):
-    pass
+    raise NotImplementedError()
 
 def dequantize_q4_k(data):
     # C implementation
@@ -429,20 +429,16 @@ def dequantize_q4_k(data):
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
     block_size = GGML_BLOCK_SIZES["Q4_K"]
     num_blocks = len(data) // block_size
-
     data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
     data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
-
     # Casting to float32 because float16 is very slow on CPU
     scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32)
     scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32)
     qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
     qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32)
-
     # Dequantize scales and offsets (6 bits and 4 + 2 bits)
     factors = scale_factors * np.concatenate([qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1)
     offsets = scale_offsets * np.concatenate([qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1)
-
     # Interleave low and high quantized bits
     qs2 = np.stack([qs2 & 0xf, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32)
     # Dequantize final weights using scales and offsets
@@ -513,7 +509,7 @@ def dequantize_q5_k(data):
     ], axis=1)
 
 def dequantize_q5_k_gpu(data):
-    pass
+    raise NotImplementedError()
 
 
 def dequantize_q6_k(data):
@@ -573,6 +569,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
     data = torch.from_numpy(data)
     return KTransformersOps.dequantize_q6_k(data, 210, device)
 
+def dequantize_q4_0(data):
+    # C implementation
+    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
+    # C struct definition
+    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
+    num_blocks = len(data) // GGML_BLOCK_SIZES["Q4_0"]
+
+    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 8)[:, :1].astype(np.float32)
+    qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 16)[:, 2:]
+
+    return np.concatenate([
+        scales * ((qs & 0xf).astype(np.int8) - 8),
+        scales * ((qs >> 4).astype(np.int8) - 8),
+    ], axis=1)
+
+def dequantize_q4_0_gpu(data):
+    raise NotImplementedError()
+
+def dequantize_q5_0(data):
+    # C implementation
+    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
+    # C struct definition
+    # https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
+    num_blocks = len(data) // GGML_BLOCK_SIZES["Q5_0"]
+
+    scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 2 + 8)[:, :1].astype(np.float32)
+    qh = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2:2 + 4]
+    qs = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, 2 + 4 + 16)[:, 2 + 4:]
+
+    bits = np.unpackbits(qh, axis=-1, bitorder="little")
+
+    x0 = ((qs & 0xf).astype(np.int8) | (bits[:, :16] << 4)) - 16
+    x1 = ((qs >> 4).astype(np.int8) | (bits[:, 16:] << 4)) - 16
+
+    return np.concatenate([
+        scales * x0,
+        scales * x1,
+    ], axis=1)
+
+def dequantize_q5_0_gpu(data):
+    raise NotImplementedError()
+
 def dequantize_q8_0(data):
     # C struct definition
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
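Note on the numpy routines above: they vectorize the per-block formulas from ggml-quants.c — each Q4_0 nibble is centered by subtracting 8, and each Q5_0 value additionally ORs in one high bit from qh before subtracting 16. For reference, a scalar C++ sketch of the Q4_0 case, assuming the block layout sketched earlier and an externally provided fp16-to-float helper (both are assumptions, not code from this commit):

    #include <cstdint>
    #include <cstring>

    // Assumed to exist elsewhere (e.g. ggml_fp16_to_fp32); not defined here.
    float fp16_to_float(uint16_t h);

    // Dequantize one Q4_0 block (18 bytes -> 32 floats).
    void dequantize_block_q4_0(const uint8_t* block, float* out) {
        uint16_t d_bits;
        std::memcpy(&d_bits, block, sizeof(d_bits));   // fp16 scale
        const float d = fp16_to_float(d_bits);
        const uint8_t* qs = block + 2;                 // 16 bytes of packed nibbles
        for (int i = 0; i < 16; ++i) {
            out[i]      = d * ((qs[i] & 0x0F) - 8);    // low nibbles fill the first half
            out[i + 16] = d * ((qs[i] >> 4) - 8);      // high nibbles fill the second half
        }
    }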
@@ -615,6 +653,8 @@ def dequantize_f16_gpu(data, device):
 GGML_DEQUANTIZE = {
     "F32": dequantize_f32,
     "F16": dequantize_f16,
+    "Q4_0": dequantize_q4_0,
+    "Q5_0": dequantize_q5_0,
     "Q8_0": dequantize_q8_0,
     "Q2_K": dequantize_q2_k,
     "Q3_K": dequantize_q3_k,
@@ -626,6 +666,8 @@ GGML_DEQUANTIZE = {
 GGML_DEQUANTIZE_GPU = {
     "F32": dequantize_f32_gpu,
     "F16": dequantize_f16_gpu,
+    "Q4_0": dequantize_q4_0_gpu,
+    "Q5_0": dequantize_q5_0_gpu,
     "Q8_0": dequantize_q8_0_gpu,
     "Q2_K": dequantize_q2_k_gpu,
     "Q3_K": dequantize_q3_k_gpu,
 
@@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
             logits = cuda_graph_runner(cur_token, position_ids, cache_position)
             past_key_values.change_seq_length(1)
             """
+            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
+            custom_stream = torch.cuda.Stream()
             with torch.cuda.stream(custom_stream):
-                logits=model(cur_token,
+                logits=model(inputs_embeds = inputs_embeds,
                              position_ids = position_ids,
                              cache_position = cache_position,
                              past_key_values = past_key_values,
                              return_dict = False, use_cache = True) [0]
-            #"""
+            """
             torch.cuda.synchronize()
             #print(logits)
             next_token_scores = logits_warper(inputs, logits[:, -1, :])
@@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
     generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
     past_key_values.cur_idx=cache_position
     start_time = time.time()
-    #custom_stream = torch.cuda.Stream()
 
     inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
     logits = model(
 
setup.py (4 changed lines)
@@ -67,6 +67,8 @@ class VersionInfo:
         """
         if sys.platform.startswith("linux"):
             return f'linux_{platform.uname().machine}'
+        elif sys.platform == "win32":
+            return "win_amd64"
         else:
             raise ValueError("Unsupported platform: {}".format(sys.platform))
 
@@ -97,6 +99,8 @@ class VersionInfo:
                 return 'avx2'
             raise ValueError(
                 "Unsupported cpu Instructions: {}".format(flags_line))
+        elif sys.platform == "win32":
+            return 'native'
         else:
             raise ValueError("Unsupported platform: {}".format(sys.platform))
 
third_party/llamafile/iqk_mul_mat.inc (vendored, 5 changed lines)
@@ -22,7 +22,7 @@
 
 #include <cstring>
 #include <type_traits>
-#if defined __x86_64__ || defined __aarch64__
+#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)
 
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
@@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
     return true;
 }
 
-#if defined __x86_64__
+#if defined __x86_64__ || defined(_M_X64)
 
 #if defined HAVE_FANCY_SIMD
 #undef HAVE_FANCY_SIMD
@@ -1412,6 +1412,7 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
 
 bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) {
 
+    if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0)
     row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
 
     switch (typeA) {
 
@@ -3,6 +3,6 @@
 // Copyrigth 2024 Iwan Kawrakow.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #include "iqk_mul_mat.inc"
 #endif // __x86_64__
 
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Iwan Kawrakow.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define iqk_mul_mat iqk_mul_mat_zen4
 #define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
 #include "iqk_mul_mat.inc"
 
19
third_party/llamafile/sgemm.cpp
vendored
19
third_party/llamafile/sgemm.cpp
vendored
|
@@ -22,19 +22,22 @@

 #include "sgemm.h"
 // #include <cosmo.h>
-#include <cpuid.h>
+// #include <cpuid.h>
 // #include <libc/sysv/consts/hwcap.h>
 #include <stdio.h>
-#include <sys/auxv.h>
+// #include <sys/auxv.h>
 #include <cassert>
 // #include "llamafile.h"

 static const struct GemmFuncs {
-    typeof(llamafile_sgemm)* sgemm;
-    typeof(llamafile_mixmul)* mixmul;
-    typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
+    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
+    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
+    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
+    // typeof(llamafile_sgemm)* sgemm;
+    // typeof(llamafile_mixmul)* mixmul;
+    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
     GemmFuncs() {
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
         // if (X86_HAVE(AVX)) {
         // if (X86_HAVE(FMA)) {
         // if (X86_HAVE(AVX2)) {

@@ -86,10 +89,12 @@ static const struct GemmFuncs {
         // sgemm = llamafile_sgemm_unsupported;
         // mixmul = llamafile_mixmul_unsupported;
         // }

 #if defined(__AVX__)
-#if defined(__FMA__)
+#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX2__)
 #if defined(__AVX512F__)
+        printf("__AVX512F__\n");
 #if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
         // AMD Zen4+ (2023-)
         sgemm = llamafile_sgemm_amd_zen4;
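Two Windows-specific issues drive the sgemm.cpp hunks above. First, <cpuid.h> is a GCC/Clang compiler header and <sys/auxv.h> is a Linux/glibc header; neither exists under MSVC, so both includes are commented out. Second, MSVC has no GNU-style typeof, so the GemmFuncs members are spelled out as explicit function pointers whose signatures presumably mirror llamafile_sgemm, llamafile_mixmul and llamafile_mixmul_iqk. A hedged sketch of the same idea written with a using-alias; the names below are illustrative, not from the file:

// Equivalent declaration without typeof().
using sgemm_fn = bool (*)(long, long, long, const void*, long, const void*, long,
                          void*, long, int, int, int, int, int, int, int);

struct GemmFuncsSketch {        // hypothetical stand-in for GemmFuncs
    sgemm_fn sgemm = nullptr;   // picked in the constructor from the compiled-in variants
};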
third_party/llamafile/tinyblas_cpu.h (vendored, 2 changed lines)
@@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e)
 }
 #endif

-#if defined(__FMA__)
+#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <>
 inline __m256 madd(__m256 a, __m256 b, __m256 c) {
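The widened FMA guard here and in sgemm.cpp works around another MSVC quirk: MSVC never defines __FMA__, but it does define __AVX2__ or __AVX512F__ when building with /arch:AVX2 or /arch:AVX512, and every AVX2-capable CPU also implements FMA3, so checking _MSC_VER together with those macros re-enables the fused multiply-add specialisation. A minimal sketch of code compiled under that guard; the helper name is illustrative:

#include <immintrin.h>

#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
// a * b + c in a single fused instruction; only compiled when FMA is available.
static inline __m256 fma_madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);
}
#endif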
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_mixmul llamafile_mixmul_amd_avx
 #include "tinyblas_cpu_mixmul.inc"
 #endif // __x86_64__
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_mixmul llamafile_mixmul_amd_avx2
 #include "tinyblas_cpu_mixmul.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_mixmul llamafile_mixmul_amd_avx512f
 #include "tinyblas_cpu_mixmul.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_mixmul llamafile_mixmul_amd_avxvnni
 #include "tinyblas_cpu_mixmul.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_mixmul llamafile_mixmul_amd_fma
 #include "tinyblas_cpu_mixmul.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_mixmul llamafile_mixmul_amd_zen4
 #include "tinyblas_cpu_mixmul.inc"
 #endif // __x86_64__
third_party/llamafile/tinyblas_cpu_sgemm.inc (vendored, 4 changed lines)
@@ -321,8 +321,8 @@ bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void
     assert(ith < nth);

 #if QK_K == 256
-#if defined(__x86_64__)
-#if defined(__AVX2__) && defined(__FMA__)
+#if defined(__x86_64__) || defined(_M_X64)
+#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
     // if (X86_CHECK(AVX2) && X86_CHECK(FMA)) {
     if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
         if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
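For context on the call above: in this fast path k appears to count Q8_K super-blocks rather than elements, each block covering QK_K = 256 quantised weights, which is why the branch is only compiled when QK_K == 256 and why the reduction length handed to iqk_mul_mat is k * QK_K. A tiny illustrative helper, not part of the source:

// Illustration only: convert a block count into the element count iqk_mul_mat expects.
constexpr long kQK_K = 256;                   // mirrors the "#if QK_K == 256" guard
inline long reduction_length(long k_blocks) {
    return k_blocks * kQK_K;                  // e.g. k = 16 blocks -> 4096 weights
}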
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avx
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avx2
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avx512f
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_avxvnni
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_fma
 #include "tinyblas_cpu_sgemm.inc"
 #endif // __x86_64__

@@ -3,7 +3,7 @@
 // Copyrigth 2024 Mozilla Foundation.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define llamafile_sgemm llamafile_sgemm_amd_zen4
 #define iqk_mul_mat iqk_mul_mat_zen4
 #include "tinyblas_cpu_sgemm.inc"