mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
triage for opencl
This commit is contained in:
parent
481f7a6fbc
commit
762eeb6204
4 changed files with 60 additions and 6 deletions
|
@ -934,10 +934,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
model_params.n_gpu_layers = inputs.gpulayers;
|
model_params.n_gpu_layers = inputs.gpulayers;
|
||||||
|
|
||||||
#if defined(GGML_USE_CLBLAST)
|
#if defined(GGML_USE_CLBLAST)
|
||||||
if(file_format==FileFormat::GGUF_GENERIC && (file_format_meta.model_architecture == GGUFArch::FALCON || file_format_meta.model_architecture == GGUFArch::PHI) && model_params.n_gpu_layers>0)
|
if(file_format==FileFormat::GGUF_GENERIC && model_params.n_gpu_layers>0)
|
||||||
{
|
{
|
||||||
printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
|
if(file_format_meta.model_architecture == GGUFArch::FALCON)
|
||||||
model_params.n_gpu_layers = 0;
|
{
|
||||||
|
printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
|
||||||
|
model_params.n_gpu_layers = 0;
|
||||||
|
}
|
||||||
|
else if(file_format_meta.model_architecture == GGUFArch::PHI || file_format_meta.n_expert_count>1)
|
||||||
|
{
|
||||||
|
printf("\nOpenCL cannot use regular GPU offloading for this model architecture. A fallback GPU offloader will be used with degraded performance.\n");
|
||||||
|
clblast_offload_fallback_mode = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUBLAS)
|
||||||
|
|
43
llama.cpp
43
llama.cpp
|
@ -180,6 +180,25 @@ static std::string format(const char * fmt, ...) {
|
||||||
return std::string(buf.data(), size);
|
return std::string(buf.data(), size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool clblast_offload_fallback_mode = false; //used when regular offload will segfault
|
||||||
|
static int clblast_offload_fallback_layers = 0;
|
||||||
|
// Extracts the layer index from a dotted tensor name of the form
// "<prefix>.<number>.<rest>", e.g. "blk.12.attn_q.weight" -> 12.
//
// Returns the parsed index, or -1 when:
//   - the name contains fewer than two dots,
//   - the text between the first two dots is empty or is not made up
//     exclusively of decimal digits, or
//   - the number does not fit in an int.
static int layer_name_to_number(const std::string & inputString)
{
    const size_t firstDotPosition = inputString.find('.');
    if (firstDotPosition == std::string::npos) {
        return -1;
    }

    const size_t secondDotPosition = inputString.find('.', firstDotPosition + 1);
    if (secondDotPosition == std::string::npos) {
        return -1;
    }

    const std::string numbersPortion =
        inputString.substr(firstDotPosition + 1, secondDotPosition - firstDotPosition - 1);

    // std::stoi on its own is too lenient for this purpose: it skips leading
    // whitespace, accepts a sign and silently stops at the first non-digit
    // ("12abc" -> 12). Only a plain run of digits counts as a layer number.
    if (numbersPortion.empty()) {
        return -1;
    }
    for (const char c : numbersPortion) {
        if (c < '0' || c > '9') {
            return -1;
        }
    }

    try {
        return std::stoi(numbersPortion);
    }
    catch (const std::out_of_range &) {
        // All-digit text that is too large for an int: treat as "no layer".
    }
    return -1;
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// gguf constants (sync with gguf.py)
|
// gguf constants (sync with gguf.py)
|
||||||
//
|
//
|
||||||
|
@ -2628,6 +2647,19 @@ struct llama_model_loader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(GGML_USE_CLBLAST)
|
||||||
|
if(clblast_offload_fallback_mode)
|
||||||
|
{
|
||||||
|
int layernum = layer_name_to_number(cur->name);
|
||||||
|
bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum);
|
||||||
|
if(shouldoffload)
|
||||||
|
{
|
||||||
|
cur->backend = GGML_BACKEND_GPU;
|
||||||
|
ggml_cl_transform_tensor(cur->data, cur);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
size_done += ggml_nbytes(cur);
|
size_done += ggml_nbytes(cur);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3330,7 +3362,14 @@ static bool llm_load_tensors(
|
||||||
model.n_gpu_layers = n_gpu_layers;
|
model.n_gpu_layers = n_gpu_layers;
|
||||||
|
|
||||||
const int64_t n_layer = hparams.n_layer;
|
const int64_t n_layer = hparams.n_layer;
|
||||||
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
||||||
|
|
||||||
|
if(clblast_offload_fallback_mode)
|
||||||
|
{
|
||||||
|
printf("\nOpenCL GPU Offload Fallback...");
|
||||||
|
clblast_offload_fallback_layers = n_gpu_layers;
|
||||||
|
i_gpu_start = std::max((int64_t) hparams.n_layer, (int64_t) 0);
|
||||||
|
}
|
||||||
|
|
||||||
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
||||||
model.buft_input = llama_default_buffer_type_cpu(true);
|
model.buft_input = llama_default_buffer_type_cpu(true);
|
||||||
|
@ -3401,7 +3440,7 @@ static bool llm_load_tensors(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
// assign the output layer
|
// assign the output layer
|
||||||
if (n_gpu_layers > n_layer) {
|
if (n_gpu_layers > n_layer && !clblast_offload_fallback_mode) {
|
||||||
model.buft_output = {
|
model.buft_output = {
|
||||||
split_buft,
|
split_buft,
|
||||||
llama_default_buffer_type_offload(main_gpu)
|
llama_default_buffer_type_offload(main_gpu)
|
||||||
|
|
|
@ -272,10 +272,16 @@ void print_tok_vec(std::vector<float> &embd)
|
||||||
if(modelarch!="" && fileformatmeta!=nullptr)
|
if(modelarch!="" && fileformatmeta!=nullptr)
|
||||||
{
|
{
|
||||||
std::string fkey = modelarch+".context_length";
|
std::string fkey = modelarch+".context_length";
|
||||||
auto keyidx = gguf_find_key(ctx, fkey.c_str());
|
int keyidx = gguf_find_key(ctx, fkey.c_str());
|
||||||
if (keyidx != -1) {
|
if (keyidx != -1) {
|
||||||
fileformatmeta->n_ctx_train = gguf_get_val_u32(ctx, keyidx);
|
fileformatmeta->n_ctx_train = gguf_get_val_u32(ctx, keyidx);
|
||||||
}
|
}
|
||||||
|
fkey = modelarch+".expert_count";
|
||||||
|
keyidx = gguf_find_key(ctx, fkey.c_str());
|
||||||
|
if (keyidx != -1) {
|
||||||
|
fileformatmeta->n_expert_count = gguf_get_val_u32(ctx, keyidx);
|
||||||
|
}
|
||||||
|
|
||||||
int filever = gguf_get_version(ctx);
|
int filever = gguf_get_version(ctx);
|
||||||
fileformatmeta->fileversion = filever;
|
fileformatmeta->fileversion = filever;
|
||||||
fileformatmeta->model_architecture = GGUFArch::DEFAULT;
|
fileformatmeta->model_architecture = GGUFArch::DEFAULT;
|
||||||
|
|
|
@ -62,6 +62,7 @@ struct FileFormatExtraMeta
|
||||||
int n_ctx_train = 2048;
|
int n_ctx_train = 2048;
|
||||||
int fileversion = 0;
|
int fileversion = 0;
|
||||||
GGUFArch model_architecture = GGUFArch::DEFAULT;
|
GGUFArch model_architecture = GGUFArch::DEFAULT;
|
||||||
|
int n_expert_count = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ModelLoadResult
|
enum ModelLoadResult
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue