Backwards compatibility formats all done

Merge branch 'master' into concedo

# Conflicts:
#	CMakeLists.txt
#	README.md
#	llama.cpp
Concedo 2023-03-31 19:01:33 +08:00
commit 559a1967f7
21 changed files with 832 additions and 494 deletions


@@ -18,7 +18,7 @@
 #include <alloca.h>
 #endif
-//return val: 0=fail, 1=legacy, 2=newformat
+//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 int check_file_format(const std::string & fname)
 {
     std::vector<char> f_buf(1024*1024);
@@ -33,16 +33,94 @@
     int fileformat = 0;
     uint32_t magic;
     fin.read((char *) &magic, sizeof(magic));
-    if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
+    if (magic == 0x67676d6c) { //v1 format ggml, alpaca
         fileformat = 1;
-    }else{
+    }
+    else if(magic == 0x67676d66) //v2 format ggmf
+    {
         fileformat = 2;
     }
+    else if(magic == 0x67676a74) //v3 format ggjt
+    {
+        fileformat = 3; //ggjt by default
+    }
     fin.close();
     return fileformat;
 }
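As an aside (not part of the diff itself): the magic literals compared above are simply the four ASCII characters of each format tag packed into a 32-bit value, so 0x67676d6c is 'g','g','m','l', 0x67676d66 is 'g','g','m','f', and 0x67676a74 is 'g','g','j','t'. A minimal sketch that verifies this at compile time; make_magic is a hypothetical helper used here for illustration only:

#include <cstdint>

// Hypothetical helper, illustration only: pack four ASCII characters into a
// uint32_t the same way the ggml/ggmf/ggjt magic values are laid out.
constexpr uint32_t make_magic(char a, char b, char c, char d)
{
    return (uint32_t(uint8_t(a)) << 24) | (uint32_t(uint8_t(b)) << 16) |
           (uint32_t(uint8_t(c)) <<  8) |  uint32_t(uint8_t(d));
}

static_assert(make_magic('g','g','m','l') == 0x67676d6c, "v1: unversioned ggml / alpaca");
static_assert(make_magic('g','g','m','f') == 0x67676d66, "v2: ggmf");
static_assert(make_magic('g','g','j','t') == 0x67676a74, "v3: ggjt");

The remainder of the hunk adds the frozen loader path for the v1 and v2 formats: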
+//freeze all the configurations for model loading for v1 and v2 formats
+struct llama_context * legacy_llama_init_from_file(const char * path_model, struct llama_context_params params)
+{
+    ggml_time_init();
+    llama_context * ctx = new llama_context;
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+    ctx->rng = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    if (!legacy_llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
+                                 params.vocab_only, params.progress_callback,
+                                 params.progress_callback_user_data)) {
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        llama_free(ctx);
+        return nullptr;
+    }
+    if (params.use_mlock) {
+        char *err;
+        if (!ggml_mlock(ctx->model.ctx,
+                        ctx->model.mm_addr,
+                        ctx->model.mm_length,
+                        &err)) {
+            fprintf(stderr, "%s\n", err);
+            free(err);
+            llama_free(ctx);
+            return nullptr;
+        }
+    }
+    // reserve memory for context buffers
+    {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        {
+            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        }
+        const auto & hparams = ctx->model.hparams;
+        // resized during inference
+        if (params.logits_all) {
+            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+        } else {
+            ctx->logits.reserve(hparams.n_ctx);
+        }
+        if (params.embedding){
+            ctx->embedding.resize(hparams.n_embd);
+        }
+        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+    }
+    return ctx;
+}
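For context (again, not part of the commit): the intent is that a caller probes the file with check_file_format() first, routes v1 (unversioned ggml/alpaca) and v2 (ggmf) files through this frozen legacy path, and sends v3 (ggjt) files through the regular upstream loader. A rough sketch of that dispatch; load_model_any_format is a hypothetical wrapper name, error handling is minimal, and llama_init_from_file refers to the upstream llama.cpp API of this era:

// Hypothetical wrapper, illustration only: pick a loader based on the
// detected file magic. Returns nullptr for unknown formats or load failure.
static struct llama_context * load_model_any_format(const std::string & fname,
                                                    struct llama_context_params params)
{
    const int fileformat = check_file_format(fname);
    if (fileformat == 1 || fileformat == 2) {
        // v1 (unversioned ggml / alpaca) and v2 (ggmf): frozen legacy loader
        return legacy_llama_init_from_file(fname.c_str(), params);
    }
    if (fileformat == 3) {
        // v3 (ggjt): current upstream loader
        return llama_init_from_file(fname.c_str(), params);
    }
    fprintf(stderr, "unknown model file format in '%s'\n", fname.c_str());
    return nullptr; // fileformat == 0: unrecognized magic
}

The hunk then continues with the matching legacy model loader: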
+//legacy llama model format v1 and v2 loader. there is a lot of duplicate code,
+//but it may be better to freeze it as such rather than risk tiny breaking changes
+static bool legacy_llama_model_load(