mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-17 04:19:40 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # Makefile # README.md # common/log.h
This commit is contained in:
commit
eed651494e
26 changed files with 1143 additions and 658 deletions
|
@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) {
|
|||
|
||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
||||
opt_params_adam.print_forward_graph = false;
|
||||
opt_params_adam.print_backward_graph = false;
|
||||
opt_params_lbfgs.print_forward_graph = false;
|
||||
opt_params_lbfgs.print_backward_graph = false;
|
||||
opt_params_adam.adam.n_iter = 16;
|
||||
opt_params_lbfgs.lbfgs.n_iter = 16;
|
||||
// ggml_opt(ctx0, opt_params_adam, e);
|
||||
ggml_opt(ctx0, opt_params_lbfgs, e);
|
||||
//
|
||||
ggml_build_forward_expand(&gf, e);
|
||||
|
|
|
@ -22,7 +22,9 @@
|
|||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
|
|||
assert(0u < beams_state.n_beams);
|
||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
||||
printf("%lu", n);
|
||||
printf("%zu", n);
|
||||
}
|
||||
fflush(stdout);
|
||||
#if 1 // DEBUG: print current beams for this iteration
|
||||
|
@ -145,7 +147,7 @@ int main(int argc, char ** argv)
|
|||
|
||||
if (tokens_list.size() > max_tokens_list_size)
|
||||
{
|
||||
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
|
||||
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
||||
__func__ , tokens_list.size() , max_tokens_list_size );
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -75,7 +75,7 @@ typedef struct {
|
|||
int seq_len; // max sequence length
|
||||
} Config;
|
||||
|
||||
typedef struct {
|
||||
struct TransformerWeights {
|
||||
// token embedding table
|
||||
float* token_embedding_table; // (vocab_size, dim)
|
||||
// weights for rmsnorms
|
||||
|
@ -97,7 +97,22 @@ typedef struct {
|
|||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
float* wcls;
|
||||
} TransformerWeights;
|
||||
|
||||
~TransformerWeights() {
|
||||
delete[] token_embedding_table;
|
||||
delete[] rms_att_weight;
|
||||
delete[] rms_ffn_weight;
|
||||
delete[] wq;
|
||||
delete[] wk;
|
||||
delete[] wv;
|
||||
delete[] wo;
|
||||
delete[] w1;
|
||||
delete[] w2;
|
||||
delete[] w3;
|
||||
delete[] rms_final_weight;
|
||||
delete[] wcls;
|
||||
}
|
||||
};
|
||||
|
||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
|
@ -173,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
|
|||
return 0;
|
||||
}
|
||||
|
||||
void free_weights(TransformerWeights* w) {
|
||||
delete w->token_embedding_table;
|
||||
delete w->rms_att_weight;
|
||||
delete w->rms_ffn_weight;
|
||||
delete w->wq;
|
||||
delete w->wk;
|
||||
delete w->wv;
|
||||
delete w->wo;
|
||||
delete w->w1;
|
||||
delete w->w2;
|
||||
delete w->w3;
|
||||
delete w->rms_final_weight;
|
||||
if (w->wcls) delete w->wcls;
|
||||
}
|
||||
|
||||
void print_sample_weights(TransformerWeights *w){
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
|
@ -596,6 +596,10 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
|||
// assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
|
||||
llama_file file(filename, "rb");
|
||||
if (!file.fp) {
|
||||
fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
|
||||
exit(1);
|
||||
}
|
||||
const int n_vocab = config->vocab_size;
|
||||
/* uint32_t max_token_length = */ file.read_u32(); // unused
|
||||
vocab->id_to_token.resize(n_vocab);
|
||||
|
@ -633,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
|||
}
|
||||
}
|
||||
|
||||
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
|
||||
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
int ct;
|
||||
switch (gg_weights->n_dims){
|
||||
case 1:
|
||||
|
@ -670,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
|
|||
}
|
||||
|
||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
||||
// stuff AK weights into GG weights one by one.
|
||||
// convert AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
|
||||
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
|
||||
//print_row(model->norm, 0);
|
||||
|
||||
// for rms-att-weight
|
||||
|
@ -686,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
|||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
auto & layer = model->layers[i];
|
||||
// 1d
|
||||
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
|
||||
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
}
|
||||
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
|
|||
}
|
||||
|
||||
std::string basename(const std::string &path) {
|
||||
size_t pos = path.find_last_of("/");
|
||||
size_t pos = path.find_last_of("/\\");
|
||||
if (pos == std::string::npos) {
|
||||
return path;
|
||||
}
|
||||
|
@ -911,7 +915,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
Config config;
|
||||
TransformerWeights weights;
|
||||
TransformerWeights weights = {};
|
||||
{
|
||||
FILE *file = fopen(params.fn_llama2c_model, "rb");
|
||||
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
|
||||
|
@ -953,6 +957,5 @@ int main(int argc, char ** argv) {
|
|||
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
|
||||
ggml_free(model.ctx);
|
||||
free_weights(&weights);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ For an interactive experience, try this command:
|
|||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
|
||||
'User: Hi
|
||||
AI: Hello. I am an AI chatbot. Would you like to talk?
|
||||
User: Sure!
|
||||
|
@ -45,7 +45,7 @@ User:'
|
|||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
|
|
@ -33,6 +33,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
};
|
||||
|
||||
|
||||
|
@ -69,12 +71,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
|||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
void usage(const char * executable) {
|
||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
fprintf(stderr, "\nAllowed quantization types:\n");
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
|
||||
if (it.name != "COPY") {
|
||||
printf(" %2d or ", it.ftype);
|
||||
} else {
|
||||
printf(" ");
|
||||
}
|
||||
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
|
@ -119,6 +126,9 @@ int main(int argc, char ** argv) {
|
|||
// export as [inp path]/ggml-model-[ftype].gguf
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
||||
arg_idx++;
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fname_out = argv[arg_idx];
|
||||
|
@ -131,6 +141,10 @@ int main(int argc, char ** argv) {
|
|||
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
||||
return 1;
|
||||
} else {
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
}
|
||||
arg_idx++;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#include "completion.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
|
@ -1038,7 +1040,7 @@ static json format_timings(llama_server_context &llama)
|
|||
{
|
||||
const auto timings = llama_get_timings(llama.ctx);
|
||||
|
||||
assert(timings.n_eval == llama.num_tokens_predicted);
|
||||
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
|
||||
|
||||
return json{
|
||||
{"prompt_n", timings.n_p_eval},
|
||||
|
@ -1239,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
|||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
|
||||
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
|
||||
printf("%lu", n);
|
||||
printf("%zu", n);
|
||||
}
|
||||
fflush(stdout);
|
||||
#if 0 // DEBUG: print current beams for this iteration
|
||||
|
@ -1377,7 +1379,13 @@ int main(int argc, char **argv)
|
|||
}
|
||||
}
|
||||
|
||||
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
||||
auto probs = llama.generated_token_probs;
|
||||
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
|
||||
const json data = format_final_response(llama, llama.generated_text, probs);
|
||||
|
||||
llama_print_timings(llama.ctx);
|
||||
|
||||
|
@ -1454,7 +1462,11 @@ int main(int argc, char **argv)
|
|||
|
||||
if (!llama.has_next_token) {
|
||||
// Generation is done, send extra information.
|
||||
const json data = format_final_response(llama, "", llama.generated_token_probs);
|
||||
const json data = format_final_response(
|
||||
llama,
|
||||
"",
|
||||
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
|
||||
);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
|
@ -1548,7 +1560,7 @@ int main(int argc, char **argv)
|
|||
|
||||
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
|
||||
{
|
||||
const auto * fmt = "500 Internal Server Error\n%s";
|
||||
const char fmt[] = "500 Internal Server Error\n%s";
|
||||
char buf[BUFSIZ];
|
||||
try {
|
||||
std::rethrow_exception(std::move(ep));
|
||||
|
|
|
@ -2,13 +2,16 @@
|
|||
# train-text-from-scratch checkpoint --> gguf conversion
|
||||
|
||||
import argparse
|
||||
import gguf
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue