mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge commit 'df270ef745
' into concedo_experimental
# Conflicts: # Makefile # common/CMakeLists.txt # common/common.h # common/sampling.cpp # common/sampling.h # examples/infill/infill.cpp # examples/llama-bench/llama-bench.cpp # examples/quantize-stats/quantize-stats.cpp # examples/server/server.cpp # include/llama.h # src/llama-sampling.cpp # src/llama-sampling.h # src/llama.cpp # tests/test-grammar-integration.cpp # tests/test-grammar-parser.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-llama-grammar.cpp # tests/test-sampling.cpp
This commit is contained in:
commit
12fd16bfd4
86 changed files with 3406 additions and 7795 deletions
|
@ -4,12 +4,12 @@
|
|||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
params.prompt = "The quick brown fox";
|
||||
params.sparams.seed = 1234;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
@ -39,6 +39,13 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
|
||||
|
||||
// tokenize prompt
|
||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
|
@ -65,18 +72,11 @@ int main(int argc, char ** argv) {
|
|||
printf("\nfirst run: %s", params.prompt.c_str());
|
||||
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto * logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||
|
||||
llama_sampler_accept(smpl, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result0 += next_token_str;
|
||||
|
||||
|
@ -97,6 +97,11 @@ int main(int argc, char ** argv) {
|
|||
// make new context
|
||||
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||
|
||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
|
||||
|
||||
printf("\nsecond run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
|
@ -125,17 +130,11 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto * logits = llama_get_logits(ctx2);
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
||||
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||
|
||||
llama_sampler_accept(smpl2, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result1 += next_token_str;
|
||||
|
||||
|
@ -158,7 +157,12 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// make new context
|
||||
auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||
auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||
|
||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
|
@ -216,17 +220,11 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// third run with seq 1 instead of 0
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto * logits = llama_get_logits(ctx3);
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx3, &candidates_p);
|
||||
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
||||
|
||||
llama_sampler_accept(smpl3, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result2 += next_token_str;
|
||||
|
||||
|
@ -241,6 +239,10 @@ int main(int argc, char ** argv) {
|
|||
|
||||
printf("\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_sampler_free(smpl2);
|
||||
llama_sampler_free(smpl3);
|
||||
|
||||
llama_free(ctx3);
|
||||
llama_free_model(model);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue