Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)

Merge commit 'c8ade30036' into concedo_experimental

# Conflicts:
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/im2col_f16.cl
#	ggml/src/ggml-opencl/kernels/im2col_f32.cl
#	ggml/src/ggml-sycl/im2col.cpp
#	tools/mtmd/clip.cpp

Commit 0d72c794fa: 10 changed files with 418 additions and 127 deletions.
@@ -1614,7 +1614,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2,24 +2,13 @@
 
 #include "ggml-common.h"
 
-static __device__ __forceinline__ void convert_f32_f32(const float * src, float * dst) {
-    *dst = *src;
-}
-
-static __device__ __forceinline__ void convert_f32_f16(const float * src, half * dst) {
-    *dst = __float2half(*src);
-}
-
-static __device__ __forceinline__ void convert_f32_bf16(const float * src, nv_bfloat16 * dst) {
-    *dst = *src;
-}
-
-static __device__ __forceinline__ void convert_f16_f16(const half * src, half * dst) {
-    *dst = *src;
-}
-
-static __device__ __forceinline__ void convert_f16_f32(const half * src, float * dst) {
-    *dst = *src;
-}
+template<typename src_t, typename dst_t>
+static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
+    if constexpr (std::is_same_v<src_t, dst_t>) {
+        *dst = *src;
+    } else {
+        *dst = float(*src);
+    }
+}
 
 static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
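Note on the hunk above: five hand-written conversion helpers collapse into one `convert_flt` template, where `if constexpr` selects a plain copy for same-type pairs and a round-trip through `float` otherwise. A minimal, self-contained C++ sketch of that same pattern (using `double` as a stand-in for CUDA's `half`/`nv_bfloat16`, which require device headers):

```cpp
#include <cstdio>
#include <type_traits>

// Same-type copies take the trivial branch; mixed-type copies
// go through float, mirroring convert_flt's two branches.
template <typename src_t, typename dst_t>
void convert_flt_sketch(const src_t * src, dst_t * dst) {
    if constexpr (std::is_same_v<src_t, dst_t>) {
        *dst = *src;                  // no conversion needed
    } else {
        *dst = dst_t(float(*src));    // convert via float
    }
}

int main() {
    double d = 3.5;
    float  f = 0.0f;
    convert_flt_sketch(&d, &f);       // mixed-type branch
    double d2 = 0.0;
    convert_flt_sketch(&d, &d2);      // same-type branch
    std::printf("%f %f\n", f, d2);
}
```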
@@ -230,22 +219,7 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
     quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
 }
 
-static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    convert_f32_f32((const float *)cxi, (float *)cdsti);
-}
-
-static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    convert_f32_f16((const float *)cxi, (half *)cdsti);
-}
-
-static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
-    convert_f32_bf16((const float *)cxi, (nv_bfloat16 *)cdsti);
-}
-
-static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    convert_f16_f16((const half *)cxi, (half *)cdsti);
-}
-
-static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
-    convert_f16_f32((const half *)cxi, (float *)cdsti);
-}
+template<typename src_t, typename dst_t>
+static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
+    convert_flt((const src_t *)cxi, (dst_t *)cdsti);
+}
@@ -8,7 +8,7 @@
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
     const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
@@ -139,43 +139,14 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des
 #endif
 }
 
-static void ggml_cpy_f16_f32_cuda(
+template<typename src_t, typename dst_t>
+static void ggml_cpy_flt_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-}
-
-static void ggml_cpy_f32_f32_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-}
-
-static void ggml_cpy_f32_bf16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-}
-
-static void ggml_cpy_f32_f16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+    cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
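All four former launchers were identical except for the kernel they instantiated, so one templated launcher with the usual ceil-division grid sizing replaces them. A hedged, CPU-only C++ sketch of the shape of that consolidation (a serial loop stands in for the kernel launch, and `double` for the half-precision types):

```cpp
#include <cstdio>

// One template "launcher" replaces a family of hand-written wrappers.
// In the real code the type parameters are float/half/nv_bfloat16 and the
// loop body runs as one GPU thread per element.
template <typename src_t, typename dst_t>
void cpy_flt_launch(const void * src, void * dst, int ne) {
    const src_t * s = (const src_t *) src;
    dst_t       * d = (dst_t *) dst;
    for (int i = 0; i < ne; ++i) {
        d[i] = dst_t(float(s[i]));   // element-wise convert-and-copy
    }
}

int main() {
    float  src[4] = {1, 2, 3, 4};
    double dst[4] = {};
    cpy_flt_launch<float, double>(src, dst, 4);
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
}
```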
@@ -307,16 +278,6 @@ static void ggml_cpy_f32_iq4_nl_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
-static void ggml_cpy_f16_f16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-}
-
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection_for_this_node) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -372,11 +333,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
@@ -403,9 +364,17 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
+        ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
+        ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
             ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -430,11 +399,11 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         return nullptr;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+        return (void*) cpy_flt<cpy_1_flt<float, float>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
+        return (void*) cpy_flt<cpy_1_flt<float, nv_bfloat16>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+        return (void*) cpy_flt<cpy_1_flt<float, half>>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||||
return (void*) cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>;
|
return (void*) cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>;
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
return (void*) cpy_flt<cpy_1_flt<half, half>>;
|
||||||
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||||
|
return (void*) cpy_flt<cpy_1_flt<half, nv_bfloat16>>;
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
return (void*) cpy_flt<cpy_1_flt<half, float>>;
|
||||||
|
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||||
|
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, half>>;
|
||||||
|
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||||
|
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, nv_bfloat16>>;
|
||||||
|
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||||
|
return (void*) cpy_flt<cpy_1_flt<nv_bfloat16, float>>;
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
|
|
|
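`ggml_cuda_cpy_fn` hands back the address of the selected kernel, and every `cpy_flt<cpy_1_flt<src, dst>>` instantiation is a distinct function with a distinct address, so each type pair gets a unique identifier. A small C++ illustration of that property (host types standing in for `half` and `nv_bfloat16`):

```cpp
#include <cstdio>

// Each <src, dst> pair instantiates a separate function, so the
// addresses returned per type combination are all different.
template <typename src_t, typename dst_t>
void cpy_instance() {}

int main() {
    void * f32_to_f16ish = (void *) &cpy_instance<float, short>;  // short ~ half
    void * f32_to_f32    = (void *) &cpy_instance<float, float>;
    std::printf("distinct addresses: %s\n",
                f32_to_f16ish != f32_to_f32 ? "yes" : "no");
}
```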
@@ -3247,13 +3247,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             {
                 ggml_type src0_type = op->src[0]->type;
                 ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
+                if ((src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_BF16 || src0_type == GGML_TYPE_F16) &&
+                    (src1_type == GGML_TYPE_F32 || src1_type == GGML_TYPE_BF16 || src1_type == GGML_TYPE_F16)
+                ) {
+                    return true;
+                }
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
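The per-pair checks above collapse into one cartesian condition over the three plain float types, which also admits the new BF16 source combinations. A standalone C++ sketch of the collapsed predicate (the enum values are stand-ins for the ggml type constants, not the real definitions):

```cpp
#include <cstdio>

// Stand-in for the ggml_type constants involved (illustration only).
enum Type { F32, F16, BF16, Q8_0 };

static bool is_plain_float(Type t) {
    return t == F32 || t == F16 || t == BF16;
}

// The collapsed CPY support check: any of the 3x3 float-type pairs.
static bool cpy_supported(Type src, Type dst) {
    return is_plain_float(src) && is_plain_float(dst);
}

int main() {
    std::printf("F16->BF16: %d, F32->Q8_0 (handled separately): %d\n",
                cpy_supported(F16, BF16), cpy_supported(F32, Q8_0));
}
```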
@@ -3289,12 +3285,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
                     return true;
                 }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
                 if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
                     return true;
                 }
@@ -3375,7 +3365,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return op->src[0]->ne[1] % 128 == 0;
             }
         case GGML_OP_CONT:
-            return op->src[0]->type != GGML_TYPE_BF16;
+            return true;
         case GGML_OP_DIAG_MASK_INF:
             return true;
         case GGML_OP_SOFT_MAX:
@@ -4,24 +4,8 @@
 typedef void (*set_rows_kernel_t)(const char * src, char * dst);
 
 template<typename src_t, typename dst_t>
-__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
-    GGML_UNUSED(src_f);
-    GGML_UNUSED(dst_f);
-}
-
-template<>
-__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
-    convert_f32_f16(src_f, dst_h);
-}
-
-template<>
-__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
-    convert_f32_bf16(src_f, dst_b);
-}
-
-template<>
-__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
-    convert_f32_f32(src_f, dst_f);
-}
+__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
+    convert_flt(src_f, dst_f);
+}
 
 // Generic quantized set_rows kernel template
ggml/src/ggml-opencl/kernels/conv2d.cl (new file, 185 additions)

#ifdef USE_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define T_FLOAT half
#define T_FLOAT4 half4
#define VSTORE_T_FLOAT4(data, offset, p) vstore_half4_rte(data, offset, p)
#else
#define T_FLOAT float
#define T_FLOAT4 float4
#define VSTORE_T_FLOAT4(data, offset, p) vstore4(data, offset, p)
#endif

#if defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
#define REQD_SUBGROUP_SIZE_128
#endif

#define T_ACCUM float4
#define VEC_SIZE 4

#define BS_K 64
#define BS_NPQ 64
#define BS_CRS 16

#define TS_K 4
#define TS_NPQ 8

#define WG_K (BS_K / TS_K)
#define WG_NPQ (BS_NPQ / TS_NPQ)

#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE)
#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE)

static inline uint splitWork(uint work_size, uint block_size){
    return (work_size + block_size - 1) / block_size;
}

REQD_SUBGROUP_SIZE_128
kernel void kernel_conv_2d(
    global void* p_knl,
    ulong off_knl,
    global void* p_src,
    ulong off_src,
    global void* p_dst,
    ulong off_dst,
    local void* shared,
    uint Cout, uint Cin, uint N,
    uint KW, uint KH, uint W, uint H, uint OW, uint OH,
    uint s0, uint s1, uint p0, uint p1, uint d0, uint d1,
    uint nb01, uint nb02, uint nb03,
    uint nb11, uint nb12, uint nb13,
    uint nb1, uint nb2, uint nb3
) {
    global T_FLOAT* knl_data = (global T_FLOAT*) ((global char*)p_knl + off_knl);
    global T_FLOAT* src_data = (global T_FLOAT*) ((global char*)p_src + off_src);
    global T_FLOAT* dst_data = (global T_FLOAT*) ((global char*)p_dst + off_dst);

    const uint K = Cout;
    const uint CRS = Cin*KH*KW;
    const uint NPQ = N*OH*OW;

    const uint lid_k = get_local_id(0);
    const uint lid_npq = get_local_id(1);
    const uint tid = lid_npq * WG_K + lid_k;

    const uint B_idx_K = get_group_id(0);
    const uint B_idx_NPQ = get_group_id(1);

    const uint offset_k = B_idx_K * BS_K;
    const uint offset_npq = B_idx_NPQ * BS_NPQ;

    local T_FLOAT* Ash = (local T_FLOAT*)shared;
    local T_FLOAT4* Bsh = (local T_FLOAT4*) &Ash[BS_K * BS_CRS];

    T_ACCUM regC[TS_K][TS_NPQ_VEC];
    for (int i = 0; i < TS_K; ++i) {
        for (int j = 0; j < TS_NPQ_VEC; ++j) {
            regC[i][j] = (T_ACCUM)(0.0f);
        }
    }

    const uint NB_CRS = splitWork(CRS, BS_CRS);

    for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) {
        const uint offset_crs = B_idx_CRS * BS_CRS;

        for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) {
            const uint k_l = i / BS_CRS;
            const uint crs_l = i % BS_CRS;
            const uint k_g = offset_k + k_l;
            const uint crs_g = offset_crs + crs_l;

            if (k_g < K && crs_g < CRS) {
                const uint Cin_idx = crs_g / (KW*KH);
                const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW;
                const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW;
                const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03;
                Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx];
            } else {
                Ash[k_l * BS_CRS + crs_l] = (T_FLOAT)0.0f;
            }
        }

        for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) {
            const uint crs_l = i / BS_NPQ_VEC;
            const uint npq_l_vec = i % BS_NPQ_VEC;
            const uint crs_g = offset_crs + crs_l;

            T_FLOAT4 val = (T_FLOAT4)(0.0f);
            if (crs_g < CRS) {
                const uint Cin_idx = crs_g / (KW * KH);
                const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW;
                const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW;
                for (int v = 0; v < VEC_SIZE; ++v) {
                    const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v;
                    if (npq_g < NPQ) {
                        const uint N_idx = npq_g / (OH * OW);
                        const uint pq_idx = npq_g % (OH * OW);
                        const uint OH_idx = pq_idx / OW;
                        const uint OW_idx = pq_idx % OW;
                        const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1);
                        const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0);

                        if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) {
                            const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13;
                            ((T_FLOAT*)&val)[v] = src_data[src_idx];
                        }
                    }
                }
            }
            Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val;
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        #pragma unroll
        for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) {
            T_FLOAT regA[TS_K];
            for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
                regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l];
            }

            for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
                T_FLOAT4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg];
                for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
                    regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), convert_float4(regB), regC[k_l_reg][npq_l_vec_reg]);
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
        const uint k_g = offset_k + lid_k * TS_K + k_l_reg;
        if (k_g >= K) continue;

        for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
            const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE;

            const uint N_idx = npq_g_base / (OH * OW);
            const uint pq_idx = npq_g_base % (OH * OW);
            const uint OH_idx = pq_idx / OW;
            const uint OW_idx = pq_idx % OW;

            if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) {
                const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3;
                VSTORE_T_FLOAT4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]);
            } else {
                T_ACCUM res = regC[k_l_reg][npq_l_vec_reg];
                for (int v = 0; v < VEC_SIZE; ++v) {
                    const uint npq_g = npq_g_base + v;
                    if (npq_g < NPQ) {
                        const uint N_idx_s = npq_g / (OH*OW);
                        const uint pq_idx_s = npq_g % (OH*OW);
                        const uint OH_idx_s = pq_idx_s / OW;
                        const uint OW_idx_s = pq_idx_s % OW;
                        const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3;
                        dst_data[dst_idx_s] = (T_FLOAT)(((float*)&res)[v]);
                    }
                }
            }
        }
    }
}
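A note on launch geometry, inferred from the kernel's tiling constants rather than from code on this page (the actual enqueue logic lives in ggml/src/ggml-opencl/ggml-opencl.cpp, which is not shown): each work-group of WG_K x WG_NPQ = 16 x 8 work-items produces a BS_K x BS_NPQ = 64 x 64 output tile, with each work-item holding a TS_K x TS_NPQ register tile, while the reduction dimension CRS = Cin*KH*KW is marched over in BS_CRS = 16 chunks staged through local memory. The same scheme applies to the f16/f32 variant that follows. A hedged host-side C++ sketch of the implied NDRange and local-memory sizing:

```cpp
#include <cstdio>

// Inferred sizing for kernel_conv_2d; not code from this commit.
constexpr unsigned BS_K = 64, BS_NPQ = 64, BS_CRS = 16;
constexpr unsigned TS_K = 4,  TS_NPQ = 8;
constexpr unsigned WG_K   = BS_K / TS_K;      // 16 work-items in dim 0
constexpr unsigned WG_NPQ = BS_NPQ / TS_NPQ;  //  8 work-items in dim 1

unsigned split_work(unsigned n, unsigned block) { return (n + block - 1) / block; }

int main() {
    // Hypothetical layer shape, for illustration only.
    unsigned Cout = 128, N = 1, OW = 56, OH = 56;
    unsigned K = Cout, NPQ = N * OH * OW;
    unsigned global0 = split_work(K,   BS_K)   * WG_K;
    unsigned global1 = split_work(NPQ, BS_NPQ) * WG_NPQ;
    // Ash holds a BS_K x BS_CRS kernel tile; Bsh a BS_CRS x BS_NPQ input tile.
    size_t elem = 2; // sizeof(half) when USE_FP16 is defined
    size_t local_bytes = (size_t)BS_K * BS_CRS * elem + (size_t)BS_CRS * BS_NPQ * elem;
    std::printf("global=(%u,%u) local=(%u,%u) shared=%zu bytes\n",
                global0, global1, WG_K, WG_NPQ, local_bytes);
}
```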
ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl (new file, 176 additions)

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#if defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
#define REQD_SUBGROUP_SIZE_128
#endif

#define T_ACCUM float4
#define VEC_SIZE 4

#define BS_K 64
#define BS_NPQ 64
#define BS_CRS 16

#define TS_K 4
#define TS_NPQ 8

#define WG_K (BS_K / TS_K)
#define WG_NPQ (BS_NPQ / TS_NPQ)

#define BS_NPQ_VEC (BS_NPQ / VEC_SIZE)
#define TS_NPQ_VEC (TS_NPQ / VEC_SIZE)

static inline uint splitWork(uint work_size, uint block_size){
    return (work_size + block_size - 1) / block_size;
}

REQD_SUBGROUP_SIZE_128
kernel void kernel_conv_2d(
    global void* p_knl,
    ulong off_knl,
    global void* p_src,
    ulong off_src,
    global void* p_dst,
    ulong off_dst,
    local void* shared,
    uint Cout, uint Cin, uint N,
    uint KW, uint KH, uint W, uint H, uint OW, uint OH,
    uint s0, uint s1, uint p0, uint p1, uint d0, uint d1,
    uint nb01, uint nb02, uint nb03,
    uint nb11, uint nb12, uint nb13,
    uint nb1, uint nb2, uint nb3
) {
    global half* knl_data = (global half*) ((global char*)p_knl + off_knl);
    global float* src_data = (global float*) ((global char*)p_src + off_src);
    global float* dst_data = (global float*) ((global char*)p_dst + off_dst);

    const uint K = Cout;
    const uint CRS = Cin*KH*KW;
    const uint NPQ = N*OH*OW;

    const uint lid_k = get_local_id(0);
    const uint lid_npq = get_local_id(1);
    const uint tid = lid_npq * WG_K + lid_k;

    const uint B_idx_K = get_group_id(0);
    const uint B_idx_NPQ = get_group_id(1);

    const uint offset_k = B_idx_K * BS_K;
    const uint offset_npq = B_idx_NPQ * BS_NPQ;

    local half* Ash = (local half*)shared;
    local float4* Bsh = (local float4*) &Ash[BS_K * BS_CRS];

    T_ACCUM regC[TS_K][TS_NPQ_VEC];
    for (int i = 0; i < TS_K; ++i) {
        for (int j = 0; j < TS_NPQ_VEC; ++j) {
            regC[i][j] = (T_ACCUM)(0.0f);
        }
    }

    const uint NB_CRS = splitWork(CRS, BS_CRS);

    for (uint B_idx_CRS = 0; B_idx_CRS < NB_CRS; ++B_idx_CRS) {
        const uint offset_crs = B_idx_CRS * BS_CRS;

        for (int i = tid; i < BS_K * BS_CRS; i += (WG_K * WG_NPQ)) {
            const uint k_l = i / BS_CRS;
            const uint crs_l = i % BS_CRS;
            const uint k_g = offset_k + k_l;
            const uint crs_g = offset_crs + crs_l;

            if (k_g < K && crs_g < CRS) {
                const uint Cin_idx = crs_g / (KW*KH);
                const uint KH_idx = (crs_g - Cin_idx*KW*KH) / KW;
                const uint KW_idx = crs_g - Cin_idx*KW*KH - KH_idx*KW;
                const uint knl_idx = KW_idx + KH_idx*nb01 + Cin_idx*nb02 + k_g*nb03;
                Ash[k_l * BS_CRS + crs_l] = knl_data[knl_idx];
            } else {
                Ash[k_l * BS_CRS + crs_l] = (half)0.0f;
            }
        }

        for (int i = tid; i < BS_CRS * BS_NPQ_VEC; i += (WG_K * WG_NPQ)) {
            const uint crs_l = i / BS_NPQ_VEC;
            const uint npq_l_vec = i % BS_NPQ_VEC;
            const uint crs_g = offset_crs + crs_l;

            float4 val = (float4)(0.0f);
            if (crs_g < CRS) {
                const uint Cin_idx = crs_g / (KW * KH);
                const uint KH_idx = (crs_g - Cin_idx * KW * KH) / KW;
                const uint KW_idx = crs_g - Cin_idx * KW * KH - KH_idx * KW;
                for (int v = 0; v < VEC_SIZE; ++v) {
                    const uint npq_g = offset_npq + npq_l_vec * VEC_SIZE + v;
                    if (npq_g < NPQ) {
                        const uint N_idx = npq_g / (OH * OW);
                        const uint pq_idx = npq_g % (OH * OW);
                        const uint OH_idx = pq_idx / OW;
                        const uint OW_idx = pq_idx % OW;
                        const int H_idx = (int)(OH_idx * s1 + KH_idx * d1 - p1);
                        const int W_idx = (int)(OW_idx * s0 + KW_idx * d0 - p0);

                        if (H_idx >= 0 && H_idx < H && W_idx >= 0 && W_idx < W) {
                            const uint src_idx = W_idx + H_idx * nb11 + Cin_idx * nb12 + N_idx * nb13;
                            ((float*)&val)[v] = src_data[src_idx];
                        }
                    }
                }
            }
            Bsh[crs_l * BS_NPQ_VEC + npq_l_vec] = val;
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        #pragma unroll
        for (uint crs_l = 0; crs_l < BS_CRS; ++crs_l) {
            half regA[TS_K];
            for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
                regA[k_l_reg] = Ash[(lid_k * TS_K + k_l_reg) * BS_CRS + crs_l];
            }

            for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
                float4 regB = Bsh[crs_l * BS_NPQ_VEC + lid_npq * TS_NPQ_VEC + npq_l_vec_reg];
                for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
                    regC[k_l_reg][npq_l_vec_reg] = mad(convert_float(regA[k_l_reg]), regB, regC[k_l_reg][npq_l_vec_reg]);
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    for (uint k_l_reg = 0; k_l_reg < TS_K; ++k_l_reg) {
        const uint k_g = offset_k + lid_k * TS_K + k_l_reg;
        if (k_g >= K) continue;

        for (uint npq_l_vec_reg = 0; npq_l_vec_reg < TS_NPQ_VEC; ++npq_l_vec_reg) {
            const uint npq_g_base = offset_npq + (lid_npq * TS_NPQ_VEC + npq_l_vec_reg) * VEC_SIZE;

            const uint N_idx = npq_g_base / (OH * OW);
            const uint pq_idx = npq_g_base % (OH * OW);
            const uint OH_idx = pq_idx / OW;
            const uint OW_idx = pq_idx % OW;

            if (nb1 == OW && OW_idx + VEC_SIZE <= OW && npq_g_base + VEC_SIZE <= NPQ) {
                const uint dst_idx = OW_idx + OH_idx*nb1 + k_g*nb2 + N_idx*nb3;
                vstore4(regC[k_l_reg][npq_l_vec_reg], 0, &dst_data[dst_idx]);
            } else {
                T_ACCUM res = regC[k_l_reg][npq_l_vec_reg];
                for (int v = 0; v < VEC_SIZE; ++v) {
                    const uint npq_g = npq_g_base + v;
                    if (npq_g < NPQ) {
                        const uint N_idx_s = npq_g / (OH*OW);
                        const uint pq_idx_s = npq_g % (OH*OW);
                        const uint OH_idx_s = pq_idx_s / OW;
                        const uint OW_idx_s = pq_idx_s % OW;
                        const uint dst_idx_s = OW_idx_s + OH_idx_s*nb1 + k_g*nb2 + N_idx_s*nb3;
                        dst_data[dst_idx_s] = ((float*)&res)[v];
                    }
                }
            }
        }
    }
}
@@ -63,7 +63,7 @@ dry_seq_break_max = 128
 extra_images_max = 4
 
 # global vars
-KcppVersion = "1.96.2"
+KcppVersion = "1.97"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}
@@ -392,7 +392,7 @@ struct clip_ctx {
     std::vector<ggml_backend_buffer_type_t> backend_buft;
 
     ggml_backend_t backend = nullptr;
-    ggml_backend_t backend_cpu;
+    ggml_backend_t backend_cpu = nullptr;
     ggml_backend_buffer_ptr buf;
 
     int max_nodes = 8192;
@@ -253,6 +253,7 @@ struct server_task {
         defaults.sampling = params_base.sampling;
         defaults.speculative = params_base.speculative;
         defaults.n_keep = params_base.n_keep;
+        defaults.antiprompt = params_base.antiprompt;
 
         // enabling this will output extra debug information in the HTTP responses from the server
         params.verbose = params_base.verbosity > 9;
@@ -490,6 +491,10 @@ struct server_task {
                 }
             }
         }
+        // set reverse prompt from cli args if not set in the request
+        if (params.antiprompt.empty()) {
+            params.antiprompt = defaults.antiprompt;
+        }
     }
 
     {
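Taken together with the `--reverse-prompt` change in common/arg.cpp above, the two server hunks make per-request reverse prompts win while the CLI-supplied values act only as a fallback. A tiny C++ sketch of that merge rule (function and variable names here are illustrative, not the server's API):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Fallback merge: a request-level antiprompt list overrides the CLI
// defaults; the defaults apply only when the request leaves it empty.
std::vector<std::string> merge_antiprompt(std::vector<std::string> request,
                                          const std::vector<std::string> & defaults) {
    if (request.empty()) {
        request = defaults;
    }
    return request;
}

int main() {
    std::vector<std::string> cli = {"User:"};
    auto a = merge_antiprompt({}, cli);       // falls back to the CLI value
    auto b = merge_antiprompt({"###"}, cli);  // the request wins
    std::printf("%s %s\n", a[0].c_str(), b[0].c_str());
}
```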