mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/full-rocm.Dockerfile # .devops/llama-cli-rocm.Dockerfile # .devops/llama-server-rocm.Dockerfile # .github/workflows/build.yml # .github/workflows/python-type-check.yml # CMakeLists.txt # CONTRIBUTING.md # README.md # ci/run.sh # examples/embedding/embedding.cpp # examples/server/README.md # flake.lock # ggml/include/ggml.h # ggml/src/ggml.c # requirements/requirements-convert_legacy_llama.txt # scripts/sync-ggml.last # src/llama-vocab.cpp # src/llama.cpp # tests/test-backend-ops.cpp # tests/test-grad0.cpp # tests/test-tokenizer-0.cpp
This commit is contained in:
commit
ce7f9c9a2c
39 changed files with 103400 additions and 102738 deletions
|
@ -20,6 +20,8 @@
|
|||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <thread>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
@ -607,13 +609,16 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
|
|||
|
||||
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
|
||||
|
||||
static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
|
||||
// variables to track number of compiles in progress
|
||||
static uint32_t compile_count = 0;
|
||||
static std::mutex compile_count_mutex;
|
||||
static std::condition_variable compile_count_cond;
|
||||
|
||||
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
||||
GGML_ASSERT(parameter_count > 0);
|
||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
|
||||
pipeline = std::make_shared<vk_pipeline_struct>();
|
||||
pipeline->name = name;
|
||||
pipeline->parameter_count = parameter_count;
|
||||
|
@ -681,7 +686,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
|
|||
pipeline->layout);
|
||||
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
||||
|
||||
device->pipelines.insert({ pipeline->name, pipeline });
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
device->pipelines.insert({ pipeline->name, pipeline });
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||
assert(compile_count > 0);
|
||||
compile_count--;
|
||||
}
|
||||
compile_count_cond.notify_all();
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
||||
|
@ -1079,7 +1094,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
|
|||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
} else {
|
||||
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
// use rebar if available, otherwise fallback to device only visible memory
|
||||
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||
|
@ -1148,11 +1164,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
// mulmat
|
||||
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_s = { device->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_s = { std::max(device->subgroup_size, 16u), 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size };
|
||||
|
||||
std::initializer_list<uint32_t> warptile_mmq_l = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_mmq_m = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_mmq_s = { device->subgroup_size, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_mmq_s = { std::max(device->subgroup_size, 16u), 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size };
|
||||
|
||||
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
|
||||
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
|
||||
|
@ -1193,6 +1209,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
|
||||
std::vector<std::future<void>> compiles;
|
||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
|
||||
{
|
||||
// wait until fewer than N compiles are in progress
|
||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
||||
while (compile_count >= N) {
|
||||
compile_count_cond.wait(guard);
|
||||
}
|
||||
compile_count++;
|
||||
}
|
||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
|
||||
};
|
||||
|
||||
if (device->fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
||||
|
@ -1742,6 +1772,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
}
|
||||
|
||||
static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
@ -2806,7 +2840,11 @@ static void ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t
|
|||
|
||||
static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
|
||||
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
|
||||
// If the device is not an UMA device the memory is host-accessible through rebar. While writing
|
||||
// through PCIe is sufficient fast reading back data from PCIe is slower than going through
|
||||
// the HW device to host copy path.
|
||||
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
|
||||
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
memcpy(dst, (uint8_t *) src->ptr + offset, size);
|
||||
|
@ -5008,6 +5046,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
}
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
@ -5124,7 +5164,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|||
|
||||
avg_err /= m * n;
|
||||
|
||||
std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms avg_err=" << avg_err << std::endl;
|
||||
double tflops = 2.0*m*n*k*batch*num_it / (time / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
|
||||
|
||||
std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
|
||||
|
||||
if (avg_err > 0.1) {
|
||||
std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
|
||||
|
@ -5246,12 +5288,14 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|||
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
|
||||
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
||||
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|
||||
ggml_vk_ctx_end(subctx);
|
||||
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
@ -5378,6 +5422,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|||
}
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
|
||||
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
||||
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
|
||||
|
||||
|
@ -5445,7 +5491,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|||
|
||||
avg_err /= m * n;
|
||||
|
||||
std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
|
||||
double tflops = 2.0*m*n*k*batch*num_it / (time_ms / 1000.0) / (1000.0*1000.0*1000.0*1000.0);
|
||||
|
||||
std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl;
|
||||
|
||||
if (avg_err > 0.01 || std::isnan(avg_err)) {
|
||||
std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
|
||||
|
@ -5497,9 +5545,6 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
|
|||
|
||||
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||
#if defined(GGML_VULKAN_RUN_TESTS)
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue