You'll never take us alive

We swore that death will do us part
They'll call our crimes a work of art
Concedo 2025-01-09 11:27:06 +08:00
commit e788b8289a
19 changed files with 1866 additions and 1704 deletions


@@ -1361,7 +1361,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
}
void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
+if (!tensor->clblast_offload_gpu) {
return;
}
@@ -1419,7 +1419,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
}
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+GGML_ASSERT(src1->clblast_offload_gpu);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
@@ -1483,7 +1483,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
}
static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+GGML_ASSERT(src1->clblast_offload_gpu);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
@@ -1573,13 +1573,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
-if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
+if (src0->clblast_offload_gpu) { // NOLINT
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
-cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+cl_mem d_Y = src1->clblast_offload_gpu ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+cl_mem d_D = dst->clblast_offload_gpu ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
size_t x_offset = 0;
@@ -1587,7 +1587,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
// TODO: copy src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
-if (src0->backend == GGML_BACKEND_TYPE_GPU) {
+if (src0->clblast_offload_gpu) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else {
// copy src0 to device
@@ -1596,7 +1596,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// copy src1 to device
-if (src1->backend == GGML_BACKEND_TYPE_CPU) {
+if (!src1->clblast_offload_gpu) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
}
@@ -1620,7 +1620,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}
// copy dst to host
-if (dst->backend == GGML_BACKEND_TYPE_CPU) {
+if (!dst->clblast_offload_gpu) {
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}
@@ -1629,13 +1629,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
}
}
-if (src0->backend != GGML_BACKEND_TYPE_GPU) {
+if (!src0->clblast_offload_gpu) {
ggml_cl_pool_free(d_X, x_size);
}
-if (src1->backend != GGML_BACKEND_TYPE_GPU) {
+if (!src1->clblast_offload_gpu) {
ggml_cl_pool_free(d_Y, y_size);
}
-if (dst->backend != GGML_BACKEND_TYPE_GPU) {
+if (!dst->clblast_offload_gpu) {
ggml_cl_pool_free(d_D, d_size);
}
}
@@ -1678,7 +1678,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t y_size;
size_t d_size;
cl_mem d_X;
-if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
+if (src0->clblast_offload_gpu) { // NOLINT
d_X = (cl_mem) src0->extra;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1695,7 +1695,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
// TODO: copy src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
-if (src0->backend == GGML_BACKEND_TYPE_GPU) {
+if (src0->clblast_offload_gpu) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else {
// copy src0 to device
@@ -1750,7 +1750,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}
// copy dst to host, then convert to float
-if (dst->backend == GGML_BACKEND_TYPE_CPU) {
+if (!dst->clblast_offload_gpu) {
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1762,7 +1762,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
}
}
-if (src0->backend != GGML_BACKEND_TYPE_GPU) {
+if (!src0->clblast_offload_gpu) {
ggml_cl_pool_free(d_X, x_size);
}
ggml_cl_pool_free(d_Y, y_size);
@@ -1807,7 +1807,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Q;
-if (src0->backend == GGML_BACKEND_TYPE_CPU) {
+if (!src0->clblast_offload_gpu) {
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
}
@@ -1830,10 +1830,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device if necessary
-if (src0->backend == GGML_BACKEND_TYPE_CPU) {
+if (!src0->clblast_offload_gpu) {
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
+} else if (src0->clblast_offload_gpu) {
d_Q = (cl_mem) src0->extra;
} else {
GGML_ASSERT(false);
@@ -1842,7 +1842,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
if (!mul_mat_vec) {
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;
-const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+const size_t offset = src0->clblast_offload_gpu ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@@ -1859,7 +1859,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
// compute
const size_t global = ne01 * local;
-const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+const size_t offset = src0->clblast_offload_gpu ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00;
events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1912,7 +1912,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
}
ggml_cl_pool_free(d_Y, y_size);
ggml_cl_pool_free(d_D, d_size);
-if (src0->backend == GGML_BACKEND_TYPE_CPU) {
+if (!src0->clblast_offload_gpu) {
ggml_cl_pool_free(d_Q, q_size);
}
}
@@ -1928,7 +1928,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
-((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
+((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->clblast_offload_gpu)) {
return true;
}
@@ -2010,5 +2010,5 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));
tensor->extra = dst;
-GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
+GGML_ASSERT(tensor->clblast_offload_gpu);
}
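
For context on the pattern in the hunks above: every check of tensor->backend against GGML_BACKEND_TYPE_GPU (or GGML_BACKEND_TYPE_CPU) in the CLBlast path is replaced by a dedicated per-tensor flag, clblast_offload_gpu, so the offload decision no longer piggybacks on the shared backend enum. Below is a minimal sketch of the refactored check, assuming a reduced stand-in struct: the real ggml_tensor carries many more fields (ne, nb, type, data, extra, ...), and describe_placement is a hypothetical helper for illustration only, not part of the commit.

#include <cstdio>

// Reduced stand-in for ggml_tensor: only the members relevant to the
// offload check are shown here.
struct tensor_sketch {
    bool  clblast_offload_gpu; // the dedicated flag introduced by this commit
    void *extra;               // holds the cl_mem handle when offloaded
};

// Hypothetical helper showing the refactored check: branch on the boolean
// flag instead of comparing a backend enum value.
static const char * describe_placement(const tensor_sketch * t) {
    if (t->clblast_offload_gpu) {
        return "GPU: reuse the cl_mem stored in tensor->extra";
    }
    return "CPU: allocate a pool buffer and copy host data to the device";
}

int main() {
    tensor_sketch on_host   = { /*clblast_offload_gpu=*/false, nullptr };
    tensor_sketch on_device = { /*clblast_offload_gpu=*/true,  nullptr };
    std::printf("host tensor   -> %s\n", describe_placement(&on_host));
    std::printf("device tensor -> %s\n", describe_placement(&on_device));
    return 0;
}

One consequence visible in the diff: the inverted conditions backend != GGML_BACKEND_TYPE_GPU and backend == GGML_BACKEND_TYPE_CPU both collapse into the single form !clblast_offload_gpu, removing the implicit assumption that a tensor not on the GPU must be on the CPU.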