ggml : Parallelize quant LUT init (#23595)

- Use OpenMP to parallelize iq2xs_init_impl and iq3xs_init_impl.
- Move the OpenMP detection from ggml-cpu to ggml-base.
- Update OpenMP dependencies in ggml-config.cmake.in.
This commit is contained in:
Jeff Bolz 2026-05-25 02:15:46 -05:00 committed by GitHub
parent b96487645c
commit 826539ce59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 238 additions and 119 deletions

View file

@ -6,6 +6,7 @@
include(CMakeFindDependencyMacro)
find_dependency(Threads)
if (NOT GGML_SHARED_LIB)
set(GGML_BASE_INTERFACE_LINK_LIBRARIES "")
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB)
if (GGML_OPENMP_ENABLED)
find_dependency(OpenMP)
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "")
if (TARGET OpenMP::OpenMP_C)
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C)
endif()
if (TARGET OpenMP::OpenMP_CXX)
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX)
endif()
list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
endif()
if (GGML_CPU_HBM)
@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml)
add_library(ggml::ggml-base UNKNOWN IMPORTED)
set_target_properties(ggml::ggml-base
PROPERTIES
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}"
INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}")
set(_ggml_all_targets "")
if (NOT GGML_BACKEND_DL)

View file

@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC)
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
else()
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
message(WARNING "OpenMP not found")
endif()
else()
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
endif()
if (GGML_OPENMP_ENABLED)
target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP)
target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
add_library(ggml
ggml-backend-dl.cpp
ggml-backend-reg.cpp)

View file

@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
else()
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
message(WARNING "OpenMP not found")
endif()
if (GGML_OPENMP_ENABLED)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
if (GGML_LLAMAFILE)

View file

@ -13,6 +13,10 @@
#include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT
#ifdef GGML_USE_OPENMP
#include <omp.h>
#endif
#define GROUP_MAX_EPS 1e-15f
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
#define GROUP_MAX_EPS_IQ2_S 1e-8f
@ -3064,70 +3068,121 @@ void iq2xs_init_impl(enum ggml_type type) {
}
kmap_q2xs[index] = i;
}
int8_t pos[8];
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
// The neighbour search runs in three passes:
// 1. Parallel: for each i, qsort and count its neighbours into n_per_i,
// and reduce the totals (num_neighbors, num_not_in_map).
// 2. Serial: prefix-sum n_per_i into offsets[], so each i has a
// pre-assigned slice of kneighbors_q2xs to write into.
// 3. Parallel: redo the qsort and write each i's neighbour list at
// offsets[i].
int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
GGML_ASSERT(n_per_i);
int num_neighbors = 0, num_not_in_map = 0;
for (int i = 0; i < kmap_size; ++i) {
if (kmap_q2xs[i] >= 0) continue;
++num_not_in_map;
for (int k = 0; k < 8; ++k) {
int l = (i >> 2*k) & 0x3;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
int d2 = 0;
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
int n = 0; int d2 = dist2[0];
int nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
#ifdef GGML_USE_OPENMP
#pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
#endif
{
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
GGML_ASSERT(dist2);
int8_t pos[8];
int i;
#ifdef GGML_USE_OPENMP
#pragma omp for schedule(dynamic, 64)
#endif
for (i = 0; i < kmap_size; ++i) {
if (kmap_q2xs[i] >= 0) {
n_per_i[i] = 0;
continue;
}
++n;
++num_not_in_map;
for (int k = 0; k < 8; ++k) {
int l = (i >> 2*k) & 0x3;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
int d2 = 0;
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
int n = 0; int d2 = dist2[0];
int nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
++n;
}
n_per_i[i] = n;
num_neighbors += n;
}
num_neighbors += n;
free(dist2);
}
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
iq2_data[gindex].neighbours = kneighbors_q2xs;
int * offsets = (int *)malloc(kmap_size*sizeof(int));
GGML_ASSERT(offsets);
int counter = 0;
for (int i = 0; i < kmap_size; ++i) {
if (kmap_q2xs[i] >= 0) continue;
for (int k = 0; k < 8; ++k) {
int l = (i >> 2*k) & 0x3;
pos[k] = 2*l + 1;
if (kmap_q2xs[i] >= 0) {
offsets[i] = -1;
continue;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
int d2 = 0;
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
kmap_q2xs[i] = -(counter + 1);
int d2 = dist2[0];
uint16_t * start = &kneighbors_q2xs[counter++];
int n = 0, nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
kneighbors_q2xs[counter++] = dist2[2*j+1];
++n;
}
*start = n;
offsets[i] = counter;
counter += 1 + n_per_i[i];
}
free(dist2);
#ifdef GGML_USE_OPENMP
#pragma omp parallel
#endif
{
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
GGML_ASSERT(dist2);
int8_t pos[8];
int i;
#ifdef GGML_USE_OPENMP
#pragma omp for schedule(dynamic, 64)
#endif
for (i = 0; i < kmap_size; ++i) {
if (kmap_q2xs[i] >= 0) continue;
for (int k = 0; k < 8; ++k) {
int l = (i >> 2*k) & 0x3;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
int d2 = 0;
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
int local_counter = offsets[i];
kmap_q2xs[i] = -(local_counter + 1);
int d2 = dist2[0];
uint16_t * start = &kneighbors_q2xs[local_counter++];
int n = 0, nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
kneighbors_q2xs[local_counter++] = dist2[2*j+1];
++n;
}
*start = n;
}
free(dist2);
}
free(offsets);
free(n_per_i);
}
void iq2xs_free_impl(enum ggml_type type) {
@ -3663,70 +3718,115 @@ void iq3xs_init_impl(int grid_size) {
}
kmap_q3xs[index] = i;
}
int8_t pos[4];
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
// See explanation of parallelism in iq2xs_init_impl
int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
GGML_ASSERT(n_per_i);
int num_neighbors = 0, num_not_in_map = 0;
for (int i = 0; i < kmap_size; ++i) {
if (kmap_q3xs[i] >= 0) continue;
++num_not_in_map;
for (int k = 0; k < 4; ++k) {
int l = (i >> 3*k) & 0x7;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
int d2 = 0;
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
int n = 0; int d2 = dist2[0];
int nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
#ifdef GGML_USE_OPENMP
#pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
#endif
{
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
GGML_ASSERT(dist2);
int8_t pos[4];
int i;
#ifdef GGML_USE_OPENMP
#pragma omp for schedule(dynamic, 64)
#endif
for (i = 0; i < kmap_size; ++i) {
if (kmap_q3xs[i] >= 0) {
n_per_i[i] = 0;
continue;
}
++n;
++num_not_in_map;
for (int k = 0; k < 4; ++k) {
int l = (i >> 3*k) & 0x7;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
int d2 = 0;
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
int n = 0; int d2 = dist2[0];
int nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
++n;
}
n_per_i[i] = n;
num_neighbors += n;
}
num_neighbors += n;
free(dist2);
}
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
iq3_data[gindex].neighbours = kneighbors_q3xs;
int * offsets = (int *)malloc(kmap_size*sizeof(int));
GGML_ASSERT(offsets);
int counter = 0;
for (int i = 0; i < kmap_size; ++i) {
if (kmap_q3xs[i] >= 0) continue;
for (int k = 0; k < 4; ++k) {
int l = (i >> 3*k) & 0x7;
pos[k] = 2*l + 1;
if (kmap_q3xs[i] >= 0) {
offsets[i] = -1;
continue;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
int d2 = 0;
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
kmap_q3xs[i] = -(counter + 1);
int d2 = dist2[0];
uint16_t * start = &kneighbors_q3xs[counter++];
int n = 0, nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
kneighbors_q3xs[counter++] = dist2[2*j+1];
++n;
}
*start = n;
offsets[i] = counter;
counter += 1 + n_per_i[i];
}
free(dist2);
#ifdef GGML_USE_OPENMP
#pragma omp parallel
#endif
{
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
GGML_ASSERT(dist2);
int8_t pos[4];
int i;
#ifdef GGML_USE_OPENMP
#pragma omp for schedule(dynamic, 64)
#endif
for (i = 0; i < kmap_size; ++i) {
if (kmap_q3xs[i] >= 0) continue;
for (int k = 0; k < 4; ++k) {
int l = (i >> 3*k) & 0x7;
pos[k] = 2*l + 1;
}
for (int j = 0; j < grid_size; ++j) {
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
int d2 = 0;
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
dist2[2*j+0] = d2;
dist2[2*j+1] = j;
}
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
int local_counter = offsets[i];
kmap_q3xs[i] = -(local_counter + 1);
int d2 = dist2[0];
uint16_t * start = &kneighbors_q3xs[local_counter++];
int n = 0, nhave = 1;
for (int j = 0; j < grid_size; ++j) {
if (dist2[2*j] > d2) {
if (nhave == nwant) break;
d2 = dist2[2*j];
++nhave;
}
kneighbors_q3xs[local_counter++] = dist2[2*j+1];
++n;
}
*start = n;
}
free(dist2);
}
free(offsets);
free(n_per_i);
}
void iq3xs_free_impl(int grid_size) {