mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-30 20:33:39 +00:00
ggml : Parallelize quant LUT init (#23595)
- Use OpenMP to parallelize iq2xs_init_impl and iq3xs_init_impl. - Move the OpenMP detection from ggml-cpu to ggml-base. - Update OpenMP dependencies in ggml-config.cmake.in.
This commit is contained in:
parent
b96487645c
commit
826539ce59
4 changed files with 238 additions and 119 deletions
|
|
@ -6,6 +6,7 @@
|
|||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(Threads)
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
set(GGML_BASE_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
|
||||
|
||||
|
|
@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB)
|
|||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
find_dependency(OpenMP)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "")
|
||||
if (TARGET OpenMP::OpenMP_C)
|
||||
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C)
|
||||
endif()
|
||||
if (TARGET OpenMP::OpenMP_CXX)
|
||||
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
|
|
@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml)
|
|||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}"
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
|
|
|
|||
|
|
@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC)
|
|||
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP)
|
||||
target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-dl.cpp
|
||||
ggml-backend-reg.cpp)
|
||||
|
|
|
|||
|
|
@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,10 @@
|
|||
#include <stdlib.h> // for qsort
|
||||
#include <stdio.h> // for GGML_ASSERT
|
||||
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#define GROUP_MAX_EPS 1e-15f
|
||||
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
|
||||
#define GROUP_MAX_EPS_IQ2_S 1e-8f
|
||||
|
|
@ -3064,70 +3068,121 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|||
}
|
||||
kmap_q2xs[index] = i;
|
||||
}
|
||||
int8_t pos[8];
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
// The neighbour search runs in three passes:
|
||||
// 1. Parallel: for each i, qsort and count its neighbours into n_per_i,
|
||||
// and reduce the totals (num_neighbors, num_not_in_map).
|
||||
// 2. Serial: prefix-sum n_per_i into offsets[], so each i has a
|
||||
// pre-assigned slice of kneighbors_q2xs to write into.
|
||||
// 3. Parallel: redo the qsort and write each i's neighbour list at
|
||||
// offsets[i].
|
||||
int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
|
||||
GGML_ASSERT(n_per_i);
|
||||
int num_neighbors = 0, num_not_in_map = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q2xs[i] >= 0) continue;
|
||||
++num_not_in_map;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
int l = (i >> 2*k) & 0x3;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
||||
int n = 0; int d2 = dist2[0];
|
||||
int nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
|
||||
#endif
|
||||
{
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
GGML_ASSERT(dist2);
|
||||
int8_t pos[8];
|
||||
int i;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp for schedule(dynamic, 64)
|
||||
#endif
|
||||
for (i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q2xs[i] >= 0) {
|
||||
n_per_i[i] = 0;
|
||||
continue;
|
||||
}
|
||||
++n;
|
||||
++num_not_in_map;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
int l = (i >> 2*k) & 0x3;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
||||
int n = 0; int d2 = dist2[0];
|
||||
int nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
++n;
|
||||
}
|
||||
n_per_i[i] = n;
|
||||
num_neighbors += n;
|
||||
}
|
||||
num_neighbors += n;
|
||||
free(dist2);
|
||||
}
|
||||
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
||||
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
||||
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
||||
|
||||
int * offsets = (int *)malloc(kmap_size*sizeof(int));
|
||||
GGML_ASSERT(offsets);
|
||||
int counter = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q2xs[i] >= 0) continue;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
int l = (i >> 2*k) & 0x3;
|
||||
pos[k] = 2*l + 1;
|
||||
if (kmap_q2xs[i] >= 0) {
|
||||
offsets[i] = -1;
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
||||
kmap_q2xs[i] = -(counter + 1);
|
||||
int d2 = dist2[0];
|
||||
uint16_t * start = &kneighbors_q2xs[counter++];
|
||||
int n = 0, nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
kneighbors_q2xs[counter++] = dist2[2*j+1];
|
||||
++n;
|
||||
}
|
||||
*start = n;
|
||||
offsets[i] = counter;
|
||||
counter += 1 + n_per_i[i];
|
||||
}
|
||||
free(dist2);
|
||||
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
GGML_ASSERT(dist2);
|
||||
int8_t pos[8];
|
||||
int i;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp for schedule(dynamic, 64)
|
||||
#endif
|
||||
for (i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q2xs[i] >= 0) continue;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
int l = (i >> 2*k) & 0x3;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
||||
int local_counter = offsets[i];
|
||||
kmap_q2xs[i] = -(local_counter + 1);
|
||||
int d2 = dist2[0];
|
||||
uint16_t * start = &kneighbors_q2xs[local_counter++];
|
||||
int n = 0, nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
kneighbors_q2xs[local_counter++] = dist2[2*j+1];
|
||||
++n;
|
||||
}
|
||||
*start = n;
|
||||
}
|
||||
free(dist2);
|
||||
}
|
||||
free(offsets);
|
||||
free(n_per_i);
|
||||
}
|
||||
|
||||
void iq2xs_free_impl(enum ggml_type type) {
|
||||
|
|
@ -3663,70 +3718,115 @@ void iq3xs_init_impl(int grid_size) {
|
|||
}
|
||||
kmap_q3xs[index] = i;
|
||||
}
|
||||
int8_t pos[4];
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
// See explanation of parallelism in iq2xs_init_impl
|
||||
int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
|
||||
GGML_ASSERT(n_per_i);
|
||||
int num_neighbors = 0, num_not_in_map = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
++num_not_in_map;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
int n = 0; int d2 = dist2[0];
|
||||
int nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
|
||||
#endif
|
||||
{
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
GGML_ASSERT(dist2);
|
||||
int8_t pos[4];
|
||||
int i;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp for schedule(dynamic, 64)
|
||||
#endif
|
||||
for (i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) {
|
||||
n_per_i[i] = 0;
|
||||
continue;
|
||||
}
|
||||
++n;
|
||||
++num_not_in_map;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
int n = 0; int d2 = dist2[0];
|
||||
int nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
++n;
|
||||
}
|
||||
n_per_i[i] = n;
|
||||
num_neighbors += n;
|
||||
}
|
||||
num_neighbors += n;
|
||||
free(dist2);
|
||||
}
|
||||
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
||||
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
||||
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
||||
|
||||
int * offsets = (int *)malloc(kmap_size*sizeof(int));
|
||||
GGML_ASSERT(offsets);
|
||||
int counter = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
if (kmap_q3xs[i] >= 0) {
|
||||
offsets[i] = -1;
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
kmap_q3xs[i] = -(counter + 1);
|
||||
int d2 = dist2[0];
|
||||
uint16_t * start = &kneighbors_q3xs[counter++];
|
||||
int n = 0, nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
kneighbors_q3xs[counter++] = dist2[2*j+1];
|
||||
++n;
|
||||
}
|
||||
*start = n;
|
||||
offsets[i] = counter;
|
||||
counter += 1 + n_per_i[i];
|
||||
}
|
||||
free(dist2);
|
||||
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
GGML_ASSERT(dist2);
|
||||
int8_t pos[4];
|
||||
int i;
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp for schedule(dynamic, 64)
|
||||
#endif
|
||||
for (i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
int local_counter = offsets[i];
|
||||
kmap_q3xs[i] = -(local_counter + 1);
|
||||
int d2 = dist2[0];
|
||||
uint16_t * start = &kneighbors_q3xs[local_counter++];
|
||||
int n = 0, nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
kneighbors_q3xs[local_counter++] = dist2[2*j+1];
|
||||
++n;
|
||||
}
|
||||
*start = n;
|
||||
}
|
||||
free(dist2);
|
||||
}
|
||||
free(offsets);
|
||||
free(n_per_i);
|
||||
}
|
||||
|
||||
void iq3xs_free_impl(int grid_size) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue