CANN: Improve loading efficiency after converting weights to NZ format. (#14985)

* CANN: Improve loading efficiency after converting weights to NZ format.

* CANN: fix typo
Author: hipudding, 2025-07-31 19:47:20 +08:00 (committed by GitHub)
parent 66625a59a5
commit 11490b3672
3 changed files with 70 additions and 58 deletions


@@ -310,5 +310,7 @@ Specifies the memory pool management strategy:
 Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
 
+### GGML_CANN_WEIGHT_NZ
+Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+
 ## TODO
 - Support more models and data types.


@@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                                   bcast_weight_nb[4], bcast_weight_nb[5]};
     aclTensor* acl_weight_tensor;
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
-    if (weightToNZ && is_matmul_weight(weight)) {
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    if (weight_to_nz && is_matmul_weight(weight)) {
         int64_t acl_stride[2] = {1, transpose_ne[1]};
         // Reverse ne.


@@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
-static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
-                                 aclDataType dataType, aclTensor **tensor)
-{
-    uint64_t size = 1;
-    for (auto i : shape) {
-        size *= i;
-    }
-
-    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
-    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
-
-    size *= sizeof(int16_t);
-
-    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
-
-    std::vector<int64_t> strides(shape.size(), 1);
-    for (int64_t i = shape.size() - 2; i >= 0; i--) {
-        strides[i] = shape[i + 1] * strides[i + 1];
-    }
-    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
-                              shape.data(), shape.size(), *deviceAddr);
-    return 0;
-}
+// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
+namespace {
+    void* g_nz_workspace = nullptr;
+    size_t g_nz_workspace_allocated = 0;
+
+    void release_nz_workspace() {
+        if (g_nz_workspace) {
+            aclrtFree(g_nz_workspace);
+            g_nz_workspace = nullptr;
+            g_nz_workspace_allocated = 0;
+        }
+    }
+
+    void relloc_nz_workspace(size_t new_size) {
+        if (new_size > g_nz_workspace_allocated) {
+            if (g_nz_workspace) {
+                aclrtFree(g_nz_workspace);
+                g_nz_workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            g_nz_workspace_allocated = new_size;
+        }
+    }
+}
+
+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param data   Pointer to the raw data buffer for the tensor weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ *       across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
 static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
-    aclrtStream stream;
-    ACL_CHECK(aclrtCreateStream(&stream));
-
-    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
-    void *weightTransposedDeviceAddr = nullptr;
-    aclTensor *weightTransposed = nullptr;
-    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
-                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
+    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
+                                      tensor->nb, 2, ACL_FORMAT_ND, offset);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor *executor;
-    void *workspaceAddr = nullptr;
 
     // TransMatmulWeight
-    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
-    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
-        workspaceAddrPtrTrans.reset(workspaceAddr);
-    }
-    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
-
-    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
-    aclrtMemcpy((char *)tensor->data + offset, size,
-                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
+                                                     &workspaceSize, &executor));
+    // Avoid frequent malloc/free of the workspace.
+    relloc_nz_workspace(workspaceSize);
+
+    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
-    aclrtFree(weightTransposedDeviceAddr);
 }
 
 // TODO: need handle tensor which has paddings.
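The workspace cache introduced above is a grow-only scratch buffer: it reallocates only when a larger conversion needs more space and is freed explicitly once loading is done, instead of paying a malloc/free pair per weight tensor. A minimal sketch of the same idea with plain malloc/free standing in for aclrtMalloc/aclrtFree (names here are illustrative, not part of the commit):

#include <cstddef>
#include <cstdlib>

namespace scratch {
    void*  buf       = nullptr;
    size_t allocated = 0;

    // Grow-only: reuse the buffer when it is already big enough,
    // otherwise free it and reallocate at the larger size.
    void reserve(size_t new_size) {
        if (new_size > allocated) {
            std::free(buf);
            buf = std::malloc(new_size);
            allocated = (buf != nullptr) ? new_size : 0;
        }
    }

    // Called once the conversion phase is over (the backend does this
    // at the start of the first graph compute).
    void release() {
        std::free(buf);
        buf = nullptr;
        allocated = 0;
    }
}

The memory is still returned before inference starts, so the cache only lives for the duration of weight loading.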
@@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
             weight_format_to_nz(tensor, data, offset);
         }
     } else {
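With this change the ND weight is copied straight into its final device buffer and converted to NZ in place, reusing the shared workspace, instead of going through the old per-weight staging allocation, extra copy, and dedicated stream. A hedged, self-contained sketch of that upload order with plain memory standing in for the NPU buffer (function names are illustrative, not the backend's API):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Placeholder for weight_format_to_nz(), which performs the in-place
// ND -> NZ conversion via aclnnTransMatmulWeight in the real code.
static void convert_in_place_stub(void* /*buf*/, size_t /*nbytes*/) {}

static void upload_weight(void* device_dst, const void* host_src, size_t nbytes,
                          bool to_nz, int64_t ne2, int64_t ne3) {
    std::memcpy(device_dst, host_src, nbytes);   // aclrtMemcpy in the real code
    if (to_nz) {
        assert(ne2 == 1 && ne3 == 1);            // only plain 2D weights are converted
        convert_in_place_stub(device_dst, nbytes);
    }
}

int main() {
    std::vector<float> host(16, 1.0f), device(16);
    upload_weight(device.data(), host.data(), host.size() * sizeof(float),
                  /*to_nz=*/true, /*ne2=*/1, /*ne3=*/1);
}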
@@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
     // TODO: quantized type?
     // int64_t line_size = ne0 * ggml_element_size(tensor);
     // int64_t line_size_align_32 = (line_size + 31) & ~31;
     // size += (line_size_align_32 - line_size);
-
-    // TODO: not support quantized yet.
-    // TODO: consider un-continue tensor.
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += ggml_row_size(
                 tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
+    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+        // NZ format weight are not support quantized yet.
+        // If ND tensor transform to NZ, size may changed.
+        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+        GGML_ASSERT(tensor->ne[2] == 1);
+        GGML_ASSERT(tensor->ne[3] == 1);
+        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
+        size_t new_size;
+        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
+            ggml_cann_type_mapping(tensor->type), &new_size));
+        ACL_CHECK(aclDestroyIntArray(acl_shape));
+        size = std::max(size, new_size);
     }
 
     return size;
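A padded NZ layout can need more bytes than the plain ND tensor, which is why get_alloc_size asks aclnnCalculateMatmulWeightSizeV2 for the converted size and allocates the max of the two. A hedged worked example, assuming a 16x16 fragment granularity for fp16 weights (the authoritative size is whatever the ACL call reports):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Assumption: the NZ layout groups fp16 data into 16x16 fragments, so
// both matrix dimensions round up to a multiple of 16.
static int64_t round_up(int64_t x, int64_t to) { return (x + to - 1) / to * to; }

int main() {
    const int64_t n = 1000, k = 1000, elem = 2;                         // fp16 weight, 1000 x 1000
    const int64_t nd_size = n * k * elem;                               // plain ND buffer
    const int64_t nz_size = round_up(n, 16) * round_up(k, 16) * elem;   // assumed NZ padding
    std::printf("ND %lld, NZ %lld, alloc %lld bytes\n",
                (long long) nd_size, (long long) nz_size,
                (long long) std::max(nd_size, nz_size));                // 2000000, 2032128, 2032128
}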
@@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         (ggml_backend_cann_context*)backend->context;
 
     ggml_cann_set_device(cann_ctx->device);
+    //release temp buffer create by set tensor.
+    release_nz_workspace();
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];