#include "ggml.h"
#include "llamaextra.h"
#include "llama.cpp"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
#include <alloca.h>
#endif

// defined further down in this file; declared here so the init function can call it
static bool legacy_llama_model_load(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
        int n_parts,
        ggml_type memory_type,
        bool vocab_only,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data);

// freeze all the configurations for model loading for v1 and v2 formats
struct llama_context * legacy_llama_init_from_file(const char * path_model, struct llama_context_params params) {
    ggml_time_init();

    llama_context * ctx = new llama_context;

    if (params.seed <= 0) {
        params.seed = time(NULL);
    }

    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!legacy_llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
                                 params.vocab_only, params.progress_callback,
                                 params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
        return nullptr;
    }

    if (params.use_mlock) {
        char *err;
        if (!ggml_mlock(ctx->model.ctx, ctx->model.mm_addr, ctx->model.mm_length, &err)) {
            fprintf(stderr, "%s\n", err);
            free(err);
            llama_free(ctx);
            return nullptr;
        }
    }

    // reserve memory for context buffers
    {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }

        const auto & hparams = ctx->model.hparams;

        // resized during inference
        if (params.logits_all) {
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
            ctx->logits.reserve(hparams.n_ctx);
        }

        if (params.embedding) {
            ctx->embedding.resize(hparams.n_embd);
        }

        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));

        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
    }

    return ctx;
}
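// Usage sketch (illustrative only, not compiled): the legacy loader is driven the same way
// as the regular llama_init_from_file() entry point. llama_context_default_params() is
// assumed to come from llama.h; adjust the fields and model path as needed.
//
//   llama_context_params params = llama_context_default_params();
//   params.n_ctx  = 2048;
//   params.seed   = -1;     // <= 0 selects a time-based seed (see above)
//   params.f16_kv = true;   // keep the KV cache in f16
//   llama_context * ctx = legacy_llama_init_from_file("models/7B/ggml-model.bin", params);
//   if (ctx == nullptr) {
//       // handle load failure
//   }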
// legacy llama model format v1 and v2 loader. there is a lot of duplicate code,
// but it may be better to freeze it as such rather than risk tiny breaking changes
static bool legacy_llama_model_load(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
        int n_parts,
        ggml_type memory_type,
        bool vocab_only,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    fprintf(stderr, "%s: Legacy loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    const int64_t t_start_us = ggml_time_us();
    lctx.t_start_us = t_start_us;

    std::vector<char> f_buf(1024*1024);

    auto & model = lctx.model;
    auto & vocab = lctx.vocab;

    auto fin = std::ifstream(fname, std::ios::binary);
    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    bool legacy_file_format = false;

    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic == 0x67676d6c) {   // 'ggml' in hex, very first version
            fprintf(stderr, "%s: very old v1 model file '%s' (please regenerate your model files if you can!)\n",
                    __func__, fname.c_str());
            legacy_file_format = true;
        } else {
            if (magic != 0x67676d66) {   // 'ggmf' in hex, second version
                fprintf(stderr, "%s: invalid legacy model file '%s' (bad magic)\n", __func__, fname.c_str());
                return false;
            }

            uint32_t format_version;
            fin.read((char *) &format_version, sizeof(format_version));

            uint32_t v2_format_version = 1;
            if (format_version != v2_format_version) {
                fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
                        __func__, fname.c_str(), format_version, v2_format_version);
                return false;
            }
        }
    }

    int n_ff = 0;

    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_mult,  sizeof(hparams.n_mult));
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));

        hparams.n_ctx = n_ctx;

        n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

        if (n_parts < 1) {
            n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
        }

        // temp warning to tell the user to use "--n_parts"
        if (hparams.f16 == 4 && n_parts != 1) {
            fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n",
                    __func__, n_parts);
            fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
        }
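        // For reference: with the 7B hyperparameters (n_embd = 4096, n_mult = 256) the formula
        // above gives n_ff = ((2*16384/3 + 255)/256)*256 = 11008, the published LLaMA-7B
        // feed-forward width. The layer count read from the file is then used below to infer
        // the model size (32 -> 7B, 40 -> 13B, 60 -> 30B, 80 -> 65B).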
        if (hparams.n_layer == 32) { model.type = e_model::MODEL_7B;  }
        if (hparams.n_layer == 40) { model.type = e_model::MODEL_13B; }
        if (hparams.n_layer == 60) { model.type = e_model::MODEL_30B; }
        if (hparams.n_layer == 80) { model.type = e_model::MODEL_65B; }

        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        fprintf(stderr, "%s: n_embd  = %d\n", __func__, hparams.n_embd);
        fprintf(stderr, "%s: n_mult  = %d\n", __func__, hparams.n_mult);
        fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
        fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
        fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
        fprintf(stderr, "%s: type    = %d\n", __func__, model.type);
    }

    // load vocab
    {
        std::string word;
        vocab.id_to_token.resize(model.hparams.n_vocab);
        std::vector<char> tmp(64);

        int32_t vocabloops = model.hparams.n_vocab;
        if (vocabloops == 32001 && legacy_file_format) {
            printf("---\n!! WARNING: Model appears to be GPT4ALL v1 model, triggering compatibility fix !!\n---\n");
            vocabloops -= 1;
        }

        for (int i = 0; i < vocabloops; i++) {
            uint32_t len;
            fin.read((char *) &len, sizeof(len));

            word.resize(len);
            if (len > 0) {
                tmp.resize(len);
                fin.read(tmp.data(), len);
                word.assign(tmp.data(), len);
            } else {
                word.clear();
            }

            float score = 0.0f; // v1 ('ggml') files carry no token scores, so default to 0
            if (!legacy_file_format) {
                fin.read((char *) &score, sizeof(score));
            }

            vocab.token_to_id[word] = i;

            auto & tok_score = vocab.id_to_token[i];
            tok_score.tok   = word;
            tok_score.score = score;
        }
    }

    if (vocab_only) {
        return true;
    }

    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    // wtype is for per-layer weights, while vtype is for other weights
    ggml_type wtype, vtype;
    switch (model.hparams.f16) {
        case 0: wtype = vtype = GGML_TYPE_F32;  break;
        case 1: wtype = vtype = GGML_TYPE_F16;  break;
        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
        case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
        default:
            {
                fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
                        __func__, fname.c_str(), model.hparams.f16);
                return false;
            }
    }

    auto & ctx = model.ctx;

    size_t ctx_size = 0;

    {
        const auto & hparams = model.hparams;

        const int n_embd  = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings

        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm

        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output

        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm

        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo

        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm

        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
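        // memory_k / memory_v below account for the self-attention KV cache: one key and one
        // value vector of n_embd elements per layer for every position in the context window.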
        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v

        ctx_size += (5 + 10*n_layer)*256; // object overhead

        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }

    // print memory requirements
    {
        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

        // this is the total memory required to run the inference
        const size_t mem_required =
            ctx_size +
            MEM_REQ_SCRATCH0.at(model.type) +
            MEM_REQ_SCRATCH1.at(model.type) +
            MEM_REQ_EVAL.at(model.type);

        // this is the memory required by one llama_state
        const size_t mem_required_state =
            scale*MEM_REQ_KV_SELF.at(model.type);

        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
    }

    // create the ggml context
    {
        lctx.model.buf.resize(ctx_size);

        struct ggml_init_params params = {
            /*.mem_size   =*/ lctx.model.buf.size(),
            /*.mem_buffer =*/ lctx.model.buf.data(),
        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }

    // prepare memory for the weights
    {
        const auto & hparams = model.hparams;

        const int n_embd  = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_vocab = hparams.n_vocab;

        model.layers.resize(n_layer);

        model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

        model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.output = ggml_new_tensor_2d(ctx, vtype,         n_embd, n_vocab);

        // map by name
        model.tensors["tok_embeddings.weight"] = model.tok_embeddings;

        model.tensors["norm.weight"]   = model.norm;
        model.tensors["output.weight"] = model.output;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

            layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
            layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);

            layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

            layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
            layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff,   n_embd);
            layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);

            // map by name
            model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;

            model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
            model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
            model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
            model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;

            model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;

            model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
            model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
        }
    }

    const size_t file_offset = fin.tellg();

    fin.close();

    std::vector<char> tmp;

    if (progress_callback) {
        progress_callback(0.0, progress_callback_user_data);
    }

    for (int i = 0; i < n_parts; ++i) {
        const int part_id = i;
        //const int part_id = n_parts - i - 1;

        std::string fname_part = fname;
        if (i > 0) {
            fname_part += "." + std::to_string(i);
        }
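        // Multi-part models are stored as sibling files: the first part keeps the base name
        // and parts 2..n append ".1", ".2", ... Every part carries the same header/vocab
        // prefix, which is why reading resumes at the file_offset captured above.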
        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());

        fin = std::ifstream(fname_part, std::ios::binary);
        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
        fin.seekg(0, fin.end);
        const size_t file_size = fin.tellg();
        fin.seekg(file_offset);

        // load weights
        {
            size_t total_size = 0;

            model.n_loaded = 0;

            fprintf(stderr, "%s: ", __func__);

            while (true) {
                int32_t n_dims;
                int32_t length;
                int32_t ftype;

                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
                fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));

                if (fin.eof()) {
                    break;
                }

                int32_t nelements = 1;
                int32_t ne[2] = { 1, 1 };
                for (int i = 0; i < n_dims; ++i) {
                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                    nelements *= ne[i];
                }

                std::string name(length, 0);
                fin.read(&name[0], length);

                if (model.tensors.find(name.data()) == model.tensors.end()) {
                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                    return false;
                }

                // split_type = 0: split by columns
                // split_type = 1: split by rows
                int split_type = 0;

                // split_type = 0:
                // regex:
                //   - tok_embeddings.*
                //   - layers.*.attention.wo.weight
                //   - layers.*.feed_forward.w2.weight
                // split_type = 1:
                // regex:
                //   - output.*
                //   - layers.*.attention.wq.weight
                //   - layers.*.attention.wk.weight
                //   - layers.*.attention.wv.weight
                //   - layers.*.feed_forward.w1.weight
                //   - layers.*.feed_forward.w3.weight
                if (name.find("tok_embeddings") != std::string::npos) {
                    split_type = 0;
                } else if (name.find("layers") != std::string::npos) {
                    if (name.find("attention.wo.weight") != std::string::npos) {
                        split_type = 0;
                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
                        split_type = 0;
                    } else {
                        split_type = 1;
                    }
                } else if (name.find("output") != std::string::npos) {
                    split_type = 1;
                }

                auto tensor = model.tensors[name.data()];

                if (n_dims == 1) {
                    if (ggml_nelements(tensor) != nelements) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                        return false;
                    }
                } else {
                    if (ggml_nelements(tensor)/n_parts != nelements) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                        return false;
                    }
                }

                if (n_dims == 1) {
                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
                        return false;
                    }
                } else {
                    if (split_type == 0) {
                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
                            return false;
                        }
                    } else {
                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
                            return false;
                        }
                    }
                }

                if (0) {
                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
                }

                size_t bpe = 0;

                switch (ftype) {
                    case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
                    case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
                    default:
                        {
                            fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
                            return false;
                        }
                }
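                // For 1-D tensors and single-part models the whole tensor is read by part 0
                // and skipped by the remaining parts. For 2-D tensors split across parts,
                // column splits (split_type 0) read a slice of every row at an offset of
                // part_id*np0 elements, while row splits (split_type 1) read whole rows at
                // an offset of part_id*np1 rows.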
                if (n_dims == 1 || n_parts == 1) {
                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                        return false;
                    }

                    if (part_id == 0) {
                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
                    } else {
                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
                    }

                    total_size += ggml_nbytes(tensor);
                } else {
                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
                        return false;
                    }

                    if (split_type == 0) {
                        const int np0 = ne[0];

                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
                        assert(row_size == tensor->nb[1]);

                        for (int i1 = 0; i1 < ne[1]; ++i1) {
                            const size_t offset_row = i1*row_size;
                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
                        }
                    } else {
                        const int np1 = ne[1];

                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);

                        for (int i1 = 0; i1 < ne[1]; ++i1) {
                            const size_t offset_row = (i1 + part_id*np1)*row_size;
                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
                        }
                    }

                    total_size += ggml_nbytes(tensor)/n_parts;
                }

                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);

                model.n_loaded++;

                // progress
                if (progress_callback) {
                    float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
                    float current_progress = (float(i) + current_file_progress) / float(n_parts);
                    progress_callback(current_progress, progress_callback_user_data);
                }

                if (model.n_loaded % 8 == 0) {
                    fprintf(stderr, ".");
                    fflush(stderr);
                }
            }

            fprintf(stderr, " done\n");

            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
            if (model.n_loaded == 0) {
                fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
            } else if (model.n_loaded != (int) model.tensors.size()) {
                fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n",
                        __func__, model.tensors.size(), model.n_loaded);
                return false;
            }
        }

        fin.close();
    }

    lctx.t_load_us = ggml_time_us() - t_start_us;

    if (progress_callback) {
        progress_callback(1.0, progress_callback_user_data);
    }

    return true;
}

// TODO: Calculate this constant from the vocabulary
#define MAX_TOKEN_LEN 18
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
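// The tokenizer below runs a small dynamic program over the input string:
//   - score[j] is the best score found so far for tokenizing text[0, j), where a candidate
//     token of length L contributes L*L, so longer vocabulary matches are preferred;
//   - prev[j] is the id of the token that ends at position j on that best path.
// The backward pass walks prev[] from the end of the string, emitting tokens in reverse
// order, and the final std::reverse restores the original order.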
std::vector<llama_token> legacy_llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
    std::vector<llama_token> res;
    std::vector<int>         score;
    std::vector<llama_token> prev;

    int len = text.length();

    score.resize(len + 1);
    prev.resize(len + 1);

    // Forward pass
    for (int i = 0; i < len; i++) {
        int max_len = std::min(len - i, MAX_TOKEN_LEN);
        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
            auto sub = text.substr(i, sub_len);
            auto token = vocab.token_to_id.find(sub);
            if (token != vocab.token_to_id.end()) {
                int token_score = sub.length() * sub.length();
                int local_score = score[i] + token_score;
                int next = i + sub_len;
                if (score[next] < local_score) {
                    score[next] = local_score;
                    prev[next] = (*token).second;
                }
            }
        }
    }

    // Backward pass
    int i = len;
    while (i > 0) {
        llama_token token_id = prev[i];
        if (token_id == 0) {
            // TODO: Return error or something more meaningful
            printf("failed to tokenize string!\n");
            break;
        }
        res.push_back(token_id);
        auto token = vocab.id_to_token[token_id].tok;
        i -= token.length();
    }

    if (bos) {
        res.push_back(1); // TODO: replace with vocab.bos
    }

    // Pieces are in reverse order so correct that
    std::reverse(res.begin(), res.end());

    return res;
}

int legacy_llama_tokenize(
        struct llama_context * ctx,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos) {
    auto res = legacy_llama_tokenize(ctx->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        fprintf(stderr, "%s: too many tokens\n", __func__);
        return -((int) res.size());
    }

    for (size_t i = 0; i < res.size(); i++) {
        tokens[i] = res[i];
    }

    return res.size();
}

std::vector<llama_token> legacy_llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    std::vector<llama_token> res(8096);
    int n = legacy_llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
    res.resize(n);

    return res;
}
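// Usage sketch (illustrative only): tokenizing a prompt with the legacy tokenizer. The
// std::string overload above allocates room for 8096 tokens and trims to the actual count.
// llama_token_to_str() is assumed to be available from llama.h for printing the pieces.
//
//   std::vector<llama_token> toks = legacy_llama_tokenize(ctx, "Hello world", /*add_bos=*/true);
//   for (llama_token t : toks) {
//       printf("%d -> '%s'\n", t, llama_token_to_str(ctx, t));
//   }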