koboldcpp/otherarch/acestep/backend.h

50 lines
1.9 KiB
C++

#pragma once
// backend.h: shared GGML backend initialization
//
// All modules use the same pattern: load all backends, pick best GPU,
// keep CPU as fallback. This avoids duplicating init logic across
// qwen3.h, qwen3-lm.h, cond.h, dit.h, vae.h.
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include <cstdio>
#include <cstring>
#include <thread>
struct BackendPair {
ggml_backend_t backend;
ggml_backend_t cpu_backend;
};
// Initialize backends: load all available (CUDA, Metal, Vulkan...),
// pick the best one, keep CPU as fallback.
// label: log prefix, e.g. "DiT", "VAE", "LM"
static BackendPair backend_init(const char * label) {
ggml_backend_load_all();
BackendPair bp = {};
bp.backend = ggml_backend_init_best();
int n_threads = (int)std::thread::hardware_concurrency() / 2;
if (n_threads < 1) n_threads = 1;
// [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances
// where only one gets the thread count)
bool best_is_cpu = (strcmp(ggml_backend_name(bp.backend), "CPU") == 0);
if (best_is_cpu) {
bp.cpu_backend = bp.backend;
ggml_backend_cpu_set_n_threads(bp.backend, n_threads);
} else {
// n_threads = (n_threads>4?4:n_threads);
bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads);
}
fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n",
label, ggml_backend_name(bp.backend), n_threads);
return bp;
}
// Create a scheduler from a backend pair.
// max_nodes: graph size hint (4096 for small models, 8192 for large)
static ggml_backend_sched_t backend_sched_new(BackendPair bp, int max_nodes) {
ggml_backend_t backends[2] = { bp.backend, bp.cpu_backend };
int n = (bp.backend == bp.cpu_backend) ? 1 : 2;
return ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true);
}