Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 17:44:38 +00:00
gpu offload not working for other arch. debug in future.
This commit is contained in:
parent 57230b5196
commit 2c6ac06936
3 changed files with 94 additions and 5 deletions
@@ -18,7 +18,7 @@
 // load the model's weights from a file
-ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
+ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
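In this first hunk the only change is the loader signature: gptj_model_load now takes an extra int gpulayers argument. The adapter code that supplies that value is not part of the hunks shown here; a minimal, hypothetical call site (file path and layer count are placeholders, not taken from this commit) might look like:

    gpt_vocab vocab;
    gptj_model model;
    // gpulayers = 0 keeps the whole model on the CPU; a positive value requests that many layers on the GPU
    ModelLoadResult res = gptj_model_load("models/gpt-j-6b.bin", model, vocab, /*gpulayers=*/8);
    if (res != ModelLoadResult::SUCCESS) {
        fprintf(stderr, "failed to load GPT-J model\n");
    }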
@@ -328,6 +328,48 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
     fin.close();
 
+    // //gpu offload for gptj
+    // #if defined(GGML_USE_CLBLAST)
+    // if(gpulayers>0)
+    // {
+    //     const auto & hparams = model.hparams;
+    //     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+    //     if(GetQuantsUnshuffled())
+    //     {
+    //         SetGPULayers(n_gpu);
+
+    //         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+
+    //         size_t vram_total = 0;
+
+    //         for (int i = 0; i < n_gpu; ++i) {
+    //             const auto & layer = model.layers[i];
+
+    //             ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
+    //             ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
+    //             ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
+    //             ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
+    //             ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
+    //             ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+    //             ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+    //             ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
+    //             ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+    //             ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
+    //         }
+
+    //         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    //     }
+    //     else
+    //     {
+    //         if(n_gpu>0)
+    //         {
+    //             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
+    //         }
+    //     }
+    // }
+    // #endif
+
+
     return ModelLoadResult::SUCCESS;
 }
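The offload path itself is committed entirely commented out, which matches the commit message: it clamps the requested layer count to the model's n_layer, pushes each layer's weight tensors to the OpenCL device with ggml_cl_transform_tensor, and tallies the bytes moved as an estimate of VRAM use. A trimmed, hypothetical sketch of that same pattern follows; the helper name, the reduced tensor list, and the surrounding includes are assumptions for illustration, not code from this commit:

    #include <algorithm>
    #include <cstdio>
    #include "ggml.h"
    #ifdef GGML_USE_CLBLAST
    #include "ggml-opencl.h"
    #endif

    // Hypothetical helper mirroring the commented-out block above; it assumes the
    // gptj_model / hparams / layer structs used elsewhere in this file and the same
    // single-argument ggml_cl_transform_tensor call that the block above uses.
    static size_t gptj_offload_to_gpu(gptj_model & model, int gpulayers) {
        size_t vram_total = 0;
    #ifdef GGML_USE_CLBLAST
        // never offload more layers than the model actually has
        const int n_gpu = std::min(gpulayers, int(model.hparams.n_layer));
        for (int i = 0; i < n_gpu; ++i) {
            auto & layer = model.layers[i];
            // move the large matmul weights to the OpenCL device and count their bytes
            for (ggml_tensor * t : { layer.c_attn_q_proj_w, layer.c_attn_k_proj_w,
                                     layer.c_attn_v_proj_w, layer.c_attn_proj_w,
                                     layer.c_mlp_fc_w,      layer.c_mlp_proj_w }) {
                ggml_cl_transform_tensor(t);
                vram_total += ggml_nbytes(t);
            }
        }
        fprintf(stderr, "[opencl] offloaded %d layers, ~%zu MB of VRAM\n", n_gpu, vram_total / 1024 / 1024);
    #else
        (void)model; (void)gpulayers; // CLBlast not compiled in: nothing to offload
    #endif
        return vram_total;
    }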