Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00
Backwards compatibility formats all done
Merge branch 'master' into concedo

# Conflicts:
#	CMakeLists.txt
#	README.md
#	llama.cpp
commit 559a1967f7
21 changed files with 832 additions and 494 deletions
extra.cpp (84 changes)
@@ -18,7 +18,7 @@
 #include <alloca.h>
 #endif
 
-//return val: 0=fail, 1=legacy, 2=newformat
+//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 int check_file_format(const std::string & fname)
 {
     std::vector<char> f_buf(1024*1024);
@@ -33,16 +33,94 @@
     int fileformat = 0;
     uint32_t magic;
     fin.read((char *) &magic, sizeof(magic));
-    if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
+    if (magic == 0x67676d6c) { //v1 format ggml, alpaca
         fileformat = 1;
-    }else{
+    }
+    else if(magic == 0x67676d66) //v2 format ggmf
+    {
         fileformat = 2;
     }
+    else if(magic == 0x67676a74) //v3 format ggjt
+    {
+        fileformat = 3; //ggjt by default
+    }
     fin.close();
 
     return fileformat;
 }
 
+//freeze all the configurations for model loading for v1 and v2 formats
+struct llama_context * legacy_llama_init_from_file(const char * path_model, struct llama_context_params params)
+{
+    ggml_time_init();
+
+    llama_context * ctx = new llama_context;
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    ctx->rng = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!legacy_llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
+                                 params.vocab_only, params.progress_callback,
+                                 params.progress_callback_user_data)) {
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        llama_free(ctx);
+        return nullptr;
+    }
+
+    if (params.use_mlock) {
+        char *err;
+        if (!ggml_mlock(ctx->model.ctx,
+                        ctx->model.mm_addr,
+                        ctx->model.mm_length,
+                        &err)) {
+            fprintf(stderr, "%s\n", err);
+            free(err);
+            llama_free(ctx);
+            return nullptr;
+        }
+    }
+
+    // reserve memory for context buffers
+    {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+
+        {
+            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        }
+
+        const auto & hparams = ctx->model.hparams;
+
+        // resized during inference
+        if (params.logits_all) {
+            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+        } else {
+            ctx->logits.reserve(hparams.n_ctx);
+        }
+
+        if (params.embedding){
+            ctx->embedding.resize(hparams.n_embd);
+        }
+
+        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+    }
+
+    return ctx;
+}
+
+//legacy llama model format v1 and v2 loader. there is a lot of duplicate code,
+//but it may be better to freeze it as such rather than risk tiny breaking changes
+static bool legacy_llama_model_load(
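The two halves of this diff work together: check_file_format() reports which on-disk format a model uses (the hex constants spell the format names in ASCII: "ggml", "ggmf", "ggjt"), and legacy_llama_init_from_file() provides a frozen loading path for the older v1/v2 formats. A caller could dispatch on the detected format roughly as sketched below. The sketch is not part of this commit: load_model_any_format is a hypothetical helper, and it assumes the llama.cpp C API of this era (llama_context_default_params, llama_init_from_file from llama.h) is available for the current ggjt path.

//Hypothetical usage sketch (not part of this commit): route a model file to a
//loader based on the magic detected by check_file_format().
#include <cstdio>
#include <string>

#include "llama.h" //assumed: llama_context_params, llama_context_default_params, llama_init_from_file

//declarations taken from extra.cpp above
int check_file_format(const std::string & fname);
struct llama_context * legacy_llama_init_from_file(const char * path_model, struct llama_context_params params);

static struct llama_context * load_model_any_format(const std::string & fname)
{
    struct llama_context_params params = llama_context_default_params();

    switch (check_file_format(fname)) {
        case 1: //original ggml / alpaca (v1)
        case 2: //ggmf (v2)
            //frozen legacy path introduced in this commit
            return legacy_llama_init_from_file(fname.c_str(), params);
        case 3: //ggjt (v3), assumed to go through the current loader
            return llama_init_from_file(fname.c_str(), params);
        default:
            fprintf(stderr, "unknown or unreadable model file: %s\n", fname.c_str());
            return nullptr;
    }
}

Funneling old formats through a single legacy entry point matches the comment at the end of the diff: the v1/v2 loader is deliberately duplicated and frozen so that later changes to the current loader cannot break older files.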