Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
Merge branch 'master' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README.md
#	ci/run.sh
#	flake.lock
#	flake.nix
#	ggml-cuda.cu
#	ggml-cuda.h
#	scripts/get-wikitext-2.sh
#	tests/CMakeLists.txt
commit 1cb8a5e955
79 changed files with 6273 additions and 2982 deletions
@@ -9,7 +9,11 @@
#include <sstream>
#include <thread>
#include <mutex>
#include <atomic>
#include <vector>
#include <array>
#include <fstream>
#include <sstream>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -322,6 +326,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    double nll = 0.0;
    double nll2 = 0.0;

    const int num_batches = (n_ctx + n_batch - 1) / n_batch;

    std::vector<float> logits;
    if (num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }

    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -330,10 +341,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        const int start = i * n_ctx;
        const int end = start + n_ctx;

        const int num_batches = (n_ctx + n_batch - 1) / n_batch;

        std::vector<float> logits;

        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
@@ -359,8 +366,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

            const auto * batch_logits = llama_get_logits(ctx);
            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            if (num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

        const auto t_end = std::chrono::high_resolution_clock::now();
@@ -389,7 +398,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // last 256 tokens. Then, we split the input up into context window size chunks to
        // process the entire prompt.
        const int first = n_ctx/2;
        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        count += n_ctx - first - 1;

@@ -403,6 +413,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
        }
        fflush(stdout);

        logits.clear();
    }
    printf("\n");

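For reference on the running figure printed above: `nll` accumulates the negative log-likelihood of the `count` tokens evaluated so far, so the second column printed by this loop is the perplexity accumulated to that point,

    PPL = exp(nll / count) = exp( -(1/N) * sum_{i=1..N} log p(token_i | preceding tokens) )

where N stands for `count` (this is just the worked form of the quantity the loop maintains, not an addition to the diff).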
@@ -420,26 +432,73 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    return {tokens, ppl, logit_history, prob_history};
}

static std::vector<float> hellaswag_evaluate_tokens(
    llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
) {
    std::vector<float> result;
    result.reserve(tokens.size() * n_vocab);
    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
        size_t n_tokens = tokens.size() - i_chunk * n_batch;
        n_tokens = std::min(n_tokens, size_t(n_batch));
        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return {};
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

        llama_batch batch_view = {
            n_tokens,
            batch.token + i,
            nullptr,
            batch.pos + i,
            batch.n_seq_id + i,
            batch.seq_id + i,
            batch.logits + i,
            0, 0, 0, // unused
        };

        const int ret = llama_decode(ctx, batch_view);
        if (ret != 0) {
            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
            return false;
        }

        const auto logits = llama_get_logits(ctx);
        result.insert(result.end(), logits, logits + n_tokens * n_vocab);

        n_past += n_tokens;
        memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
    }

    return true;
}

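A hypothetical usage sketch of the helper above (not part of the diff; the variable names mirror those used by the callers further down): with a filled batch of, say, 1200 tokens and n_batch = 512, decode_helper() calls llama_decode() on views of 512, 512 and 176 tokens, copying each view's logits into batch_logits at offset i*n_vocab:

    std::vector<float> batch_logits(n_vocab * n_ctx);
    if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
        fprintf(stderr, "%s: llama_decode() failed\n", __func__);
        return;
    }
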
#define K_TOKEN_CHUNK 4

static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
        const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
    if (eval_results.size() != eval_pairs.size()) {
        eval_results.resize(eval_pairs.size());
    }
    if (eval_pairs.empty()) return;

    size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());

    std::atomic<int> counter(0);
    auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
        float local_logprobs[K_TOKEN_CHUNK];
        while (true) {
            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
            if (first >= eval_results.size()) break;
            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
            for (size_t i = first; i < last; ++i) {
                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
                float max_logit = logits[0];
                for (int j = 1; j < n_vocab; ++j) {
                    max_logit = std::max(max_logit, logits[j]);
                }
                float sum_p = 0.f;
                for (int j = 0; j < n_vocab; ++j) {
                    sum_p += expf(logits[j] - max_logit);
                }
                local_logprobs[i - first] = logits[eval_pairs[i].second] - max_logit - std::log(sum_p);
            }
            std::memcpy(eval_results.data() + first, local_logprobs, (last - first)*sizeof(float));
        }
    };

    for (size_t it = 0; it < max_threads; ++it) {
        workers[it] = std::thread(compute);
    }
    for (size_t it = 0; it < max_threads; ++it) {
        workers[it].join();
    }
    return result;
}

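The inner loop of compute_logprobs is the standard numerically stable log-softmax: for the logit vector z at one position and a target token t, each eval_results entry evaluates

    log p(t) = z_t - max_j z_j - log( sum_j exp(z_j - max_j z_j) )

which is exactly what the max_logit / sum_p computation above produces.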
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@@ -468,7 +527,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        prompt_lines.push_back(line);
    }

    if( prompt_lines.size() % 6 != 0) {
    if (prompt_lines.size() % 6 != 0) {
        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
        return;
    }
@@ -483,7 +542,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    // Number of tasks to use when computing the score
    if ( params.hellaswag_tasks < hs_task_count ) {
    if (params.hellaswag_tasks < hs_task_count) {
        hs_task_count = params.hellaswag_tasks;
    }

@@ -500,27 +559,53 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::string ending[4];
        size_t ending_logprob_count[4];
        double ending_logprob[4];

        size_t i_batch; // starting index in the llama_batch
        size_t common_prefix; // max number of initial tokens that are the same in all sentences
        size_t required_tokens; // needed number of tokens to evaluate all 4 endings
        std::vector<llama_token> seq_tokens[4];
    };

    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );

    // Select and read data from prompt lines
    hs_data_t *hs_data = new hs_data_t[hs_task_count];
    for (size_t i=0; i < hs_task_count; i++) {
    std::vector<hs_data_t> hs_data(hs_task_count);
    for (size_t i = 0; i < hs_task_count; i++) {
        size_t idx = i;

        auto & hs_cur = hs_data[i];

        // Select a random example of those left in the prompt
        if (randomize_tasks) {
            std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
            idx = dist(rng);
        }

        hs_data[i].context = prompt_lines[idx*6];
        hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
        for (size_t j=0; j < 4; j++) {
            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
        hs_cur.context = prompt_lines[idx*6];
        hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
        for (size_t j = 0; j < 4; j++) {
            hs_cur.ending[j] = prompt_lines[idx*6+2+j];
            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
        }

        // determine the common prefix of the endings
        hs_cur.common_prefix = 0;
        for (size_t k = 0; k < hs_cur.seq_tokens[0].size(); k++) {
            if (hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[1][k] ||
                hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[2][k] ||
                hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[3][k]) {
                break;
            }
            hs_cur.common_prefix++;
        }
        hs_cur.required_tokens = hs_cur.common_prefix +
            hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
            hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
            hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;

        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());

        // Delete the selected random example from the prompt
        if (randomize_tasks) {
            prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
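As a worked example with made-up sizes: if a task's four tokenized context+ending sequences are 25, 27, 26 and 28 tokens long and share a 20-token common prefix, the loop above sets common_prefix = 20 and

    required_tokens = 20 + (25-20) + (27-20) + (26-20) + (28-20) = 46

i.e. the shared prefix is counted once and only the distinct ending tails add to the per-task token budget used when packing the batch.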
@@ -528,154 +613,426 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    }

    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);

    printf("\ntask\tacc_norm\n");

    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);

    std::vector<std::vector<int>> ending_tokens(4);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
    const int n_batch = params.n_batch;

    const int max_tasks_per_batch = 32;
    const int max_seq = 4*max_tasks_per_batch;

    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
    std::vector<float> batch_logits(n_vocab*n_ctx);

    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
        // Tokenize the context to count tokens
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
        size_t context_size = context_embd.size();
    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
    std::vector<std::thread> workers(std::thread::hardware_concurrency());

        for (int i = 0; i < 4; ++i) {
            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
            for (int k = 0; k < int(context_size); ++k) {
                if (ending_tokens[i][k] != context_embd[k]) {
                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
                    break;
    for (size_t i0 = 0; i0 < hs_task_count; i0++) {
        int n_cur = 0;

        size_t i1 = i0;
        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently

        llama_batch_clear(batch);

        // batch as many tasks as possible into the available context
        // each task has 4 unique sequence ids - one for each ending
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
            auto & hs_cur = hs_data[i1];

            const int s0 = 4*(i1 - i0);
            if (s0 + 4 > max_seq) {
                break;
            }

            for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
            }
            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix

            for (int s = 0; s < 4; ++s) {
                for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
                }
            }

            hs_cur.i_batch = i_batch;
            i_batch += hs_cur.required_tokens;

            n_cur += hs_data[i1].required_tokens;
            if (++i1 == hs_task_count) {
                break;
            }
        }

        // Do the 1st ending
        // In this case we include the context when evaluating
        //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
        auto query_embd = ending_tokens[0];
        auto query_size = query_embd.size();

        // Stop if query won't fit the ctx window
        if (query_size > (size_t)n_ctx) {
            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
        if (i0 == i1) {
            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

        // Speedup small evaluations by evaluating at least 32 tokens
        if (query_size < 32) {
            query_embd.resize(32);
        }

        // clear the KV cache
        llama_kv_cache_clear(ctx);

        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
        if (logits.empty()) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
        const auto first_probs = softmax(tok_logits);

        hs_data[task_idx].ending_logprob_count[0] = 1;
        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);

        // Calculate the logprobs over the ending
        for (size_t j = context_size; j < query_size - 1; j++) {

            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));

            const float prob = softmax(tok_logits)[query_embd[j + 1]];

            hs_data[task_idx].ending_logprob[0] += std::log(prob);
            hs_data[task_idx].ending_logprob_count[0]++;
        }

        // Calculate the mean token logprob for acc_norm
        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];

        // Do the remaining endings
        // For these, we use the bare ending with n_past = context_size
        //
        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {

            // Tokenize the query
            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
            query_size = query_embd.size();

            // Stop if query won't fit the ctx window
            if (context_size + query_size > (size_t)n_ctx) {
                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
                return;
            }

            // Speedup small evaluations by evaluating at least 32 tokens
            // No, resizing to 32 is actually slightly slower (at least on CUDA)
            //if (query_size < 32) {
            //    query_embd.resize(32);
            //}

            // Evaluate the query
            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
            if (logits.empty()) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }

            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);

            // Calculate the logprobs over the ending
            for (size_t j = 0; j < query_size - 1; j++) {
                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));

                const float prob = softmax(tok_logits)[query_embd[j + 1]];

                hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
                hs_data[task_idx].ending_logprob_count[ending_idx]++;
            }

            // Calculate the mean token logprob for acc_norm
            hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];


            // printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
            // task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
        }

        // Find the ending with maximum logprob
        size_t ending_logprob_max_idx = 0;
        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
        for (size_t j = 1; j < 4; j++) {
            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                ending_logprob_max_idx = j;
                ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
        // Compute log-probs in parallel
        // First we collect all tasks
        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];
            size_t li = hs_cur.common_prefix;
            for (int s = 0; s < 4; ++s) {
                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
                    eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]));
                }
                ++li;
            }
        }
        // Then we do the actual calculation
        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);

        // printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
        size_t ir = 0;

        // If the gold ending got the maximum logprob add one accuracy point
        if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
            acc += 1.0;
        // compute the logprobs for each ending of the decoded tasks
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];

            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));

            const auto first_probs = softmax(tok_logits);

            for (int s = 0; s < 4; ++s) {
                hs_cur.ending_logprob_count[s] = 1;
                hs_cur.ending_logprob[s] = std::log(first_probs[hs_cur.seq_tokens[s][hs_cur.common_prefix]]);
                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
                    hs_cur.ending_logprob[s] += eval_results[ir++];
                    hs_cur.ending_logprob_count[s]++;
                }
                hs_cur.ending_logprob[s] /= hs_cur.ending_logprob_count[s];
            }

            // Find the ending with maximum logprob
            size_t ending_logprob_max_idx = 0;
            double ending_logprob_max_val = hs_cur.ending_logprob[0];
            for (size_t s = 1; s < 4; s++) {
                if (hs_cur.ending_logprob[s] > ending_logprob_max_val) {
                    ending_logprob_max_idx = s;
                    ending_logprob_max_val = hs_cur.ending_logprob[s];
                }
            }

            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);

            // If the gold ending got the maximum logprob add one accuracy point
            if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
                acc += 1.0;
            }

            // Print the accumulated accuracy mean x 100
            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
            fflush(stdout);
        }

        // Print the accumulated accuracy mean x 100
        printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
        fflush(stdout);
        i0 = i1 - 1;
    }

    delete [] hs_data;
    llama_batch_free(batch);

    printf("\n");
}

struct winogrande_entry {
    std::string first;
    std::string second;
    std::array<std::string, 2> choices;
    int answer;

    size_t i_batch;
    size_t common_prefix;
    size_t required_tokens;
    size_t n_base1; // number of tokens for context + choice 1
    size_t n_base2; // number of tokens for context + choice 2
    std::vector<llama_token> seq_tokens[2];
};

static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
    std::vector<winogrande_entry> result;
    std::istringstream in(prompt);
    std::string line;
    std::array<int, 4> comma_pos;
    while (true) {
        std::getline(in, line);
        if (in.fail() || in.eof()) break;
        int ipos = 0;
        bool quote_open = false;
        for (int i = 0; i < int(line.size()); ++i) {
            if (!quote_open) {
                if (line[i] == ',') {
                    comma_pos[ipos++] = i;
                    if (ipos == 4) break;
                }
                else if (line[i] == '"') {
                    quote_open = true;
                }
            }
            else {
                if (line[i] == '"') {
                    quote_open = false;
                }
            }
        }
        if (ipos != 4) {
            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
            continue;
        }
        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
                                                    : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1);
        auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1);
        auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1);
        auto answer = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1);
        auto index = line.substr(0, comma_pos[0]);
        int where = 0;
        for ( ; where < int(sentence.size()); ++where) {
            if (sentence[where] == '_') break;
        }
        if (where == int(sentence.size())) {
            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
            continue;
        }
        std::istringstream stream(answer.c_str());
        int i_answer; stream >> i_answer;
        if (stream.fail() || i_answer < 1 || i_answer > 2) {
            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
            continue;
        }
        result.emplace_back();
        auto& wg = result.back();
        wg.first = sentence.substr(0, where);
        wg.second = sentence.substr(where + 1, sentence.size() - where - 1);
        wg.choices[0] = std::move(choice1);
        wg.choices[1] = std::move(choice2);
        wg.answer = i_answer;
    }
    return result;
}

/*
 * Evaluates the Winogrande score.
 * Uses a CSV containing task index, sentence, choice 1, choice 2, answer (1 or 2)
 * You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp
 * As an example, the 1st row in the above dataset is
 *
 *    0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
 *
 */
static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    constexpr int k_min_trailing_ctx = 3;

    auto data = load_winogrande_from_csv(params.prompt);
    if (data.empty()) {
        fprintf(stderr, "%s: no tasks\n", __func__);
        return;
    }

    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());

    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
        std::mt19937 rng(1);
        std::vector<int> aux(data.size());
        for (int i = 0; i < int(data.size()); ++i) {
            aux[i] = i;
        }
        float scale = 1/(1.f + (float)rng.max());
        std::vector<winogrande_entry> selected;
        selected.resize(params.winogrande_tasks);
        for (int i = 0; i < int(params.winogrande_tasks); ++i) {
            int j = int(scale*rng()*aux.size());
            selected[i] = std::move(data[aux[j]]);
            aux[j] = aux.back();
            aux.pop_back();
        }
        data = std::move(selected);
    }

    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);

    // This is needed as usual for LLaMA models
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    for (auto & task : data) {
        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);

        task.common_prefix = 0;
        for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
            if (task.seq_tokens[0][k] != task.seq_tokens[1][k]) {
                break;
            }
            task.common_prefix++;
        }

        task.required_tokens = task.common_prefix +
            task.seq_tokens[0].size() - task.common_prefix +
            task.seq_tokens[1].size() - task.common_prefix;

        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
    }

    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
    const int n_batch = params.n_batch;

    const int max_tasks_per_batch = 128;
    const int max_seq = 2*max_tasks_per_batch;

    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
    std::vector<float> batch_logits(n_vocab*n_ctx);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
    std::vector<std::thread> workers(std::thread::hardware_concurrency());

    int n_correct = 0;
    int n_done = 0;

    for (size_t i0 = 0; i0 < data.size(); i0++) {
        int n_cur = 0;

        size_t i1 = i0;
        size_t i_batch = 0;

        llama_batch_clear(batch);

        while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
            const int s0 = 2*(i1 - i0);
            if (s0 + 2 > max_seq) {
                break;
            }

            for (size_t i = 0; i < data[i1].common_prefix; ++i) {
                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
            }
            batch.logits[batch.n_tokens - 1] = true;

            for (int s = 0; s < 2; ++s) {
                for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
                    llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
                }
            }

            data[i1].i_batch = i_batch;
            i_batch += data[i1].required_tokens;

            n_cur += data[i1].required_tokens;
            if (++i1 == data.size()) {
                break;
            }
        }

        if (i0 == i1) {
            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

        llama_kv_cache_clear(ctx);

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto & task = data[i];

            const bool skip_choice =
                task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
                task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;

            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
            size_t li = n_base1 - 1;
            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
                eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1]));
            }
            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
                eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1]));
            }
        }
        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);

        size_t ir = 0;
        for (size_t i = i0; i < i1; ++i) {
            auto & task = data[i];

            const bool skip_choice =
                task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx &&
                task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx;

            float score_1st = 0;
            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
                score_1st += eval_results[ir++];
            }
            score_1st /= (task.seq_tokens[0].size() - n_base1 - last_1st);

            float score_2nd = 0;
            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
                score_2nd += eval_results[ir++];
            }
            score_2nd /= (task.seq_tokens[1].size() - n_base2 - last_2nd);

            int result = score_1st > score_2nd ? 1 : 2;

            if (result == task.answer) {
                ++n_correct;
            }
            ++n_done;

            // print the accumulated accuracy mean x 100
            printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
            fflush(stdout);
        }

        i0 = i1 - 1;
    }

    printf("\n");

    if (n_done < 100) return;

    const float p = 1.f*n_correct/n_done;
    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
}

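For the final summary line above, p = n_correct/n_done and the reported uncertainty is the usual binomial standard error, sigma = 100*sqrt(p*(1-p)/(n_done-1)). With made-up numbers, n_done = 1267 and n_correct = 950 give

    p = 950/1267 ≈ 0.75,  sigma ≈ 100*sqrt(0.75*0.25/1266) ≈ 1.22

so the tool would print roughly "Final Winogrande score(1267 tasks): 74.98 +/- 1.22".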
int main(int argc, char ** argv) {
    gpt_params params;

@@ -733,6 +1090,8 @@ int main(int argc, char ** argv) {
    struct results_perplexity results;
    if (params.hellaswag) {
        hellaswag_score(ctx, params);
    } else if (params.winogrande) {
        winogrande_score(ctx, params);
    } else {
        results = perplexity(ctx, params);
    }
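The HellaSwag paths above call softmax(tok_logits), a helper defined earlier in the same file and therefore not shown in this diff. For reference, a minimal sketch of that kind of numerically stable softmax (an assumption about its exact shape; the upstream helper may differ in details):

    static std::vector<float> softmax(const std::vector<float> & logits) {
        std::vector<float> probs(logits.size());
        float max_logit = logits[0];
        for (float v : logits) max_logit = std::max(max_logit, v);
        double sum_exp = 0.0;
        for (size_t i = 0; i < logits.size(); i++) {
            // subtract the maximum logit for numerical stability
            const float exp_logit = expf(logits[i] - max_logit);
            sum_exp += exp_logit;
            probs[i] = exp_logit;
        }
        for (size_t i = 0; i < logits.size(); i++) {
            probs[i] /= sum_exp;
        }
        return probs;
    }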