mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)

gpu layer offloading disabled for phi models in clblast

parent 0a70cc1ba7, commit d9a7bd577a
4 changed files with 38 additions and 36 deletions
@@ -169,13 +169,9 @@ extern "C"
     {
         printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format);
     }
-    else if(file_format==FileFormat::GGUF_FALCON)
-    {
-        printf("\n---\nIdentified as FALCON model: (ver %d)\nAttempting to Load...\n---\n", file_format);
-    }
     else
     {
-        printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+        printf("\n---\nIdentified as GGUF model: (ver %d)\nAttempting to Load...\n---\n", file_format);
     }
     ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
     if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
@@ -141,7 +141,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
     {
         return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
     }
-    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    else if(file_format == FileFormat::GGUF_GENERIC)
     {
         return std::string(llama_token_to_str(llama_ctx_v4, id));
     }
@@ -153,7 +153,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)

 static void TokenizeString(const std::string & str_to_tokenize, std::vector<int> & output_tokens, FileFormat file_format)
 {
-    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
     {
         if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
         {
@@ -182,9 +182,9 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab)
 {
     unsigned int eosID = 0;

-    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
     {
-        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        if(file_format == FileFormat::GGUF_GENERIC)
         {
             eosID = llama_token_eos(&(llama_ctx_v4->model));
         }
@@ -696,7 +696,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     file_format = in_file_format;
     n_threads = kcpp_params->n_threads = inputs.threads;
     n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads;
-    bool isGguf = (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON);
+    bool isGguf = (file_format == FileFormat::GGUF_GENERIC);

     n_batch = kcpp_params->n_batch = (isGguf?normalbatchsize:smallbatchsize);
     modelname = kcpp_params->model = inputs.model_filename;
@@ -712,7 +712,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     auto clamped_max_context_length = inputs.max_context_length;

     if(clamped_max_context_length>16384 &&
-    file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+    file_format != FileFormat::GGUF_GENERIC)
     {
         printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n");
         clamped_max_context_length = 16384;
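For readers following along, the clamp above boils down to the logic below. This is a minimal, self-contained sketch, not the koboldcpp source: the helper name and the main() harness are invented for demonstration, and only GGUF_GENERIC files are allowed to keep a requested context above 16k.

// Sketch only: mirrors the 16k context clamp applied to non-GGUF formats.
#include <cstdio>

static int clamp_max_context(int requested_ctx, bool is_gguf_generic)
{
    if (requested_ctx > 16384 && !is_gguf_generic)
    {
        printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n");
        return 16384;
    }
    return requested_ctx;
}

int main()
{
    printf("legacy format, 32768 requested -> %d\n", clamp_max_context(32768, false)); // 16384
    printf("GGUF_GENERIC, 32768 requested -> %d\n", clamp_max_context(32768, true));   // 32768
    return 0;
}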
@@ -748,7 +748,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         //approximate NTK aware ctx
         auto effectivenctx = kcpp_params->n_ctx;
-        if((file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) && file_format_meta.n_ctx_train > 2048)
+        if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
        {
             float factor = file_format_meta.n_ctx_train/2048;
             effectivenctx = effectivenctx/factor;
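The "approximate NTK aware ctx" block above divides the requested context by how many multiples of 2048 the model was trained on, presumably feeding a RoPE-scaling decision further down (not shown in this hunk). A rough sketch of just the arithmetic, with simplified types and an invented helper name; in the real code the values come from kcpp_params and file_format_meta:

// Sketch only: approximate effective context for GGUF models trained above 2048 ctx.
#include <cstdio>

static int approx_effective_ctx(int n_ctx, int n_ctx_train)
{
    int effectivenctx = n_ctx;
    if (n_ctx_train > 2048)
    {
        float factor = n_ctx_train / 2048; // integer ratio, e.g. a 4096-trained model gives 2
        effectivenctx = (int)(effectivenctx / factor);
    }
    return effectivenctx;
}

int main()
{
    // A model trained at 4096 ctx asked for 8192 behaves like a 4096 request here.
    printf("%d\n", approx_effective_ctx(8192, 4096)); // prints 4096
    return 0;
}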
@@ -781,7 +781,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

     printf("System Info: %s\n", llama_print_system_info());
     #if defined(GGML_USE_CUBLAS)
-    if(file_format!=FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+    if(file_format!=FileFormat::GGUF_GENERIC)
     {
         if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
         {
@@ -915,7 +915,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    else if(file_format==FileFormat::GGUF_GENERIC)
     {
         llama_model_params model_params = llama_model_default_params();
         llama_context_params llama_ctx_params = llama_context_default_params();
@@ -932,10 +932,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         model_params.use_mmap = inputs.use_mmap;
         model_params.use_mlock = inputs.use_mlock;
         model_params.n_gpu_layers = inputs.gpulayers;

         #if defined(GGML_USE_CLBLAST)
-        if(file_format==FileFormat::GGUF_FALCON && model_params.n_gpu_layers>0)
+        if(file_format==FileFormat::GGUF_GENERIC && (file_format_meta.model_architecture == GGUFArch::FALCON || file_format_meta.model_architecture == GGUFArch::PHI) && model_params.n_gpu_layers>0)
         {
-            printf("\nGPU layer offload for GGUF FALCON on OpenCL is known to have issues, it has been set to 0.\n");
+            printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
             model_params.n_gpu_layers = 0;
         }
         #endif
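The headline change of this commit is the guard above: in a CLBlast (OpenCL) build, GPU layer offload is now forced off for any GGUF model whose detected architecture is Falcon or Phi, rather than keying off the removed GGUF_FALCON file format. A minimal, self-contained sketch of that decision follows; the enum is re-declared locally and the helper name is invented, whereas the real code mutates llama_model_params inside gpttype_load_model:

// Sketch only: stand-in types mirroring the CLBlast-only offload guard.
#include <cstdio>

enum class GGUFArch { DEFAULT = 0, FALCON = 1, PHI = 2 };

// Falcon and Phi architectures get their requested GPU layer count forced
// back to 0, with a warning printed; everything else keeps the request.
static int clblast_clamp_gpu_layers(GGUFArch arch, int requested_gpu_layers)
{
    bool unsupported = (arch == GGUFArch::FALCON || arch == GGUFArch::PHI);
    if (unsupported && requested_gpu_layers > 0)
    {
        printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
        return 0;
    }
    return requested_gpu_layers;
}

int main()
{
    printf("phi with 20 layers requested  -> %d layers offloaded\n",
           clblast_clamp_gpu_layers(GGUFArch::PHI, 20));     // 0
    printf("llama with 20 layers requested -> %d layers offloaded\n",
           clblast_clamp_gpu_layers(GGUFArch::DEFAULT, 20)); // 20
    return 0;
}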
@@ -1642,13 +1643,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     else
     {
         bool triggersc = useSmartContext;
-        if(useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
+        if(useContextShift && (file_format == FileFormat::GGUF_GENERIC))
         {
             PurgeMissingTokens(llama_ctx_v4, current_context_tokens, embd_inp, inputs.max_length, nctx);
             triggersc = false;
         }
         ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false);
-        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        if(file_format == FileFormat::GGUF_GENERIC)
         {
             llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);
         }
@@ -1669,7 +1670,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         //for non llama, limit to 256
         int bbs = blasbatchsize;
-        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
        {
            bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
        }
@@ -1821,7 +1822,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, kcpp_params->n_threads)==0);
     }
-    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    else if(file_format == FileFormat::GGUF_GENERIC)
     {
         evalres = (llama_decode(llama_ctx_v4, llama_batch_get_one(embd.data(), embdsize, n_past, 0))==0);
     }
@@ -1934,9 +1935,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     float * logitsPtr;
     float lowestLogit = 0;
     int btsize = banned_token_ids.size();
-    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
     {
-        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        if(file_format == FileFormat::GGUF_GENERIC)
        {
            logitsPtr = llama_get_logits(llama_ctx_v4);
        }
@@ -255,7 +255,7 @@ void print_tok_vec(std::vector<float> &embd)
     else if(magic == 0x46554747)
     {
         fin.close();
-        fileformat = FileFormat::GGUF_LLAMA;
+        fileformat = FileFormat::GGUF_GENERIC;

         struct gguf_init_params ggufparams;
         ggufparams.no_alloc = true;
@@ -267,19 +267,8 @@ void print_tok_vec(std::vector<float> &embd)
     std::string modelarch = "";
     if (keyidx != -1) { modelarch = gguf_get_val_str(ctx, keyidx); }

-    if(modelarch=="llama")
-    {
-        fileformat = FileFormat::GGUF_LLAMA;
-    }
-    else if(modelarch=="falcon")
-    {
-        fileformat = FileFormat::GGUF_FALCON; //uses the same loader
-    }
-
-
     printf("\nThe reported GGUF Arch is: %s\n",(modelarch==""?"unknown":modelarch.c_str()));

-
     if(modelarch!="" && fileformatmeta!=nullptr)
     {
         std::string fkey = modelarch+".context_length";
@@ -289,6 +278,15 @@ void print_tok_vec(std::vector<float> &embd)
         }
         int filever = gguf_get_version(ctx);
         fileformatmeta->fileversion = filever;
+        fileformatmeta->model_architecture = GGUFArch::DEFAULT;
+        if(modelarch=="phi2")
+        {
+            fileformatmeta->model_architecture = GGUFArch::PHI;
+        }
+        else if(modelarch=="falcon")
+        {
+            fileformatmeta->model_architecture = GGUFArch::FALCON;
+        }
     }
     gguf_free(ctx);
 }
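The added branch above is where the architecture string read from the GGUF header (via gguf_get_val_str in the previous hunk) gets mapped onto the new GGUFArch values: "phi2" becomes PHI, "falcon" becomes FALCON, and everything else, including "llama", stays DEFAULT. As an isolated sketch, with a hypothetical helper name and a local re-declaration of the enum:

// Sketch only: hypothetical helper mirroring the new architecture mapping.
#include <cstdio>
#include <string>

enum class GGUFArch { DEFAULT = 0, FALCON = 1, PHI = 2 };

static GGUFArch arch_from_string(const std::string & modelarch)
{
    if (modelarch == "phi2")   { return GGUFArch::PHI; }
    if (modelarch == "falcon") { return GGUFArch::FALCON; }
    return GGUFArch::DEFAULT;  // "llama" and every other GGUF architecture
}

int main()
{
    printf("phi2 -> %d, falcon -> %d, llama -> %d\n",
           (int)arch_from_string("phi2"),
           (int)arch_from_string("falcon"),
           (int)arch_from_string("llama"));
    return 0;
}

A phi2 file therefore still loads as the single GGUF_GENERIC file format; only the metadata carries the per-architecture tag that the CLBlast guard consults at load time.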
@@ -21,7 +21,8 @@ enum FileFormat
     GGJT=3, // 3=(llama ggjt)
     GGJT_2=4, //newer llama format unshuffled
     GGJT_3=5, //using 16bit scalar
-    GGUF_LLAMA=6, //GGUF (llama newest ver)
+
+    GGUF_GENERIC=6, //GGUF (llama newest ver)

     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
|
@ -47,14 +48,20 @@ enum FileFormat
|
||||||
|
|
||||||
MPT_1=500, //first supported mpt version
|
MPT_1=500, //first supported mpt version
|
||||||
|
|
||||||
GGUF_FALCON=600, //GGUF (falcon)
|
};
|
||||||
|
|
||||||
|
enum GGUFArch
|
||||||
|
{
|
||||||
|
DEFAULT = 0, //used for llama and other generic gguf
|
||||||
|
FALCON = 1,
|
||||||
|
PHI = 2,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct FileFormatExtraMeta
|
struct FileFormatExtraMeta
|
||||||
{
|
{
|
||||||
int n_ctx_train = 2048;
|
int n_ctx_train = 2048;
|
||||||
int fileversion = 0;
|
int fileversion = 0;
|
||||||
|
GGUFArch model_architecture = GGUFArch::DEFAULT;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ModelLoadResult
|
enum ModelLoadResult
|
||||||
|
|