mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-09 19:46:11 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # .devops/nix/package.nix # README.md # ggml-metal.m # llama.cpp # scripts/sync-ggml.last # tests/test-backend-ops.cpp
This commit is contained in:
commit
ac43e0115c
28 changed files with 1225 additions and 309 deletions
|
|
@ -20,11 +20,11 @@ static std::vector<std::string> split_lines(const std::string & s) {
|
|||
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
||||
for (size_t i = 0; i < tokens.size(); i++) {
|
||||
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
|
||||
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
static void normalize(float * vec, float * out, int n) {
|
||||
static void normalize(const float * vec, float * out, int n) {
|
||||
float norm = 0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
norm += vec[i] * vec[i];
|
||||
|
|
@ -46,10 +46,23 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||
}
|
||||
|
||||
// normalize on copy
|
||||
for (int k = 0; k < n_seq; k++) {
|
||||
float * emb = llama_get_embeddings_ith(ctx, k);
|
||||
float * out = output + k * n_embd;
|
||||
normalize(emb, out, n_embd);
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
if (!batch.logits[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
if (embd == NULL) {
|
||||
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
float * out = output + batch.seq_id[i][0] * n_embd;
|
||||
normalize(embd, out, n_embd);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -133,7 +146,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// initialize batch
|
||||
const int n_prompts = prompts.size();
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
|
||||
// allocate output
|
||||
const int n_embd = llama_n_embd(model);
|
||||
|
|
@ -146,6 +159,7 @@ int main(int argc, char ** argv) {
|
|||
for (int k = 0; k < n_prompts; k++) {
|
||||
// clamp to n_batch tokens
|
||||
auto & inp = inputs[k];
|
||||
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
|
|
|
|||
|
|
@ -512,6 +512,14 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
// tokenized antiprompts
|
||||
std::vector<std::vector<llama_token>> antiprompt_ids;
|
||||
|
||||
antiprompt_ids.reserve(params.antiprompt.size());
|
||||
for (const std::string & antiprompt : params.antiprompt) {
|
||||
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
||||
}
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
|
|
@ -770,6 +778,18 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// check for reverse prompt using special tokens
|
||||
llama_token last_token = llama_sampling_last(ctx_sampling);
|
||||
for (std::vector<llama_token> ids : antiprompt_ids) {
|
||||
if (ids.size() == 1 && last_token == ids[0]) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
}
|
||||
is_antiprompt = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
LOG("found antiprompt: %s\n", last_output.c_str());
|
||||
}
|
||||
|
|
|
|||
34
examples/server-embd.py
Normal file
34
examples/server-embd.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import asyncio
|
||||
import requests
|
||||
import numpy as np
|
||||
|
||||
n = 8
|
||||
|
||||
result = []
|
||||
|
||||
async def requests_post_async(*args, **kwargs):
|
||||
return await asyncio.to_thread(requests.post, *args, **kwargs)
|
||||
|
||||
async def main():
|
||||
model_url = "http://127.0.0.1:6900"
|
||||
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
|
||||
url= f"{model_url}/embedding",
|
||||
json= {"content": str(i)*1024}
|
||||
) for i in range(n)])
|
||||
|
||||
for response in responses:
|
||||
embedding = response.json()["embedding"]
|
||||
print(embedding[-8:])
|
||||
result.append(embedding)
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
# compute cosine similarity
|
||||
|
||||
for i in range(n-1):
|
||||
for j in range(i+1, n):
|
||||
embedding1 = np.array(result[i])
|
||||
embedding2 = np.array(result[j])
|
||||
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
|
||||
print(f"Similarity between {i} and {j}: {similarity:.2f}")
|
||||
|
||||
|
|
@ -414,7 +414,7 @@ struct llama_server_context
|
|||
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||
if (res < 0) {
|
||||
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
|
||||
sparams.chat_template = "chatml";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1211,7 +1211,7 @@ struct llama_server_context
|
|||
queue_results.send(res);
|
||||
}
|
||||
|
||||
void send_embedding(server_slot &slot)
|
||||
void send_embedding(server_slot & slot, const llama_batch & batch)
|
||||
{
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
|
|
@ -1220,6 +1220,7 @@ struct llama_server_context
|
|||
res.stop = true;
|
||||
|
||||
const int n_embd = llama_n_embd(model);
|
||||
|
||||
if (!params.embedding)
|
||||
{
|
||||
LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
|
||||
|
|
@ -1230,12 +1231,29 @@ struct llama_server_context
|
|||
}
|
||||
else
|
||||
{
|
||||
const float *data = llama_get_embeddings(ctx);
|
||||
std::vector<float> embedding(data, data + n_embd);
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", embedding},
|
||||
};
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
if (embd == NULL) {
|
||||
LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", std::vector<float>(n_embd, 0.0f)},
|
||||
};
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", std::vector<float>(embd, embd + n_embd)},
|
||||
};
|
||||
}
|
||||
}
|
||||
queue_results.send(res);
|
||||
}
|
||||
|
|
@ -1846,7 +1864,7 @@ struct llama_server_context
|
|||
ga_i += ga_w/ga_n;
|
||||
}
|
||||
}
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
|
|
@ -1882,7 +1900,7 @@ struct llama_server_context
|
|||
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
|
||||
{
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
|
|
@ -1955,7 +1973,7 @@ struct llama_server_context
|
|||
// prompt evaluated for embedding
|
||||
if (slot.embedding)
|
||||
{
|
||||
send_embedding(slot);
|
||||
send_embedding(slot, batch_view);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue;
|
||||
|
|
@ -2037,6 +2055,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" --pooling {none,mean,cls}\n");
|
||||
printf(" pooling type for embeddings, use model default if unspecified\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
|
|
@ -2277,6 +2297,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
params.yarn_beta_slow = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--pooling")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
||||
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
||||
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
else if (arg == "--threads" || arg == "-t")
|
||||
{
|
||||
if (++i >= argc)
|
||||
|
|
@ -2331,7 +2363,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
break;
|
||||
}
|
||||
params.n_batch = std::stoi(argv[i]);
|
||||
params.n_batch = std::min(512, params.n_batch);
|
||||
}
|
||||
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
|
||||
{
|
||||
|
|
|
|||
|
|
@ -6,3 +6,4 @@ More info:
|
|||
|
||||
- https://github.com/ggerganov/llama.cpp/pull/2926
|
||||
- https://github.com/ggerganov/llama.cpp/pull/3624
|
||||
- https://github.com/ggerganov/llama.cpp/pull/5625
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
|
@ -20,6 +21,7 @@ struct seq_draft {
|
|||
std::vector<int> i_batch_tgt;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
std::vector<std::vector<llama_token_data>> dists;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling;
|
||||
};
|
||||
|
|
@ -39,12 +41,15 @@ int main(int argc, char ** argv) {
|
|||
// max number of parallel drafting sequences (i.e. tree branches)
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// probability threshold for accepting a token from the draft model
|
||||
const float p_accept = params.p_accept;
|
||||
|
||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_split = params.p_split;
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
std::default_random_engine rng(params.seed);
|
||||
std::uniform_real_distribution<> u_dist;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
|
|
@ -168,7 +173,9 @@ int main(int argc, char ** argv) {
|
|||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
if (params.sparams.temp == 0) {
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
}
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
|
|
@ -184,12 +191,15 @@ int main(int argc, char ** argv) {
|
|||
drafts[0].i_batch_tgt[0] = 0;
|
||||
|
||||
while (true) {
|
||||
std::set<int> active_seqs = {};
|
||||
|
||||
// print current draft sequences
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
active_seqs.insert(s);
|
||||
const auto & tokens = drafts[s].tokens;
|
||||
|
||||
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
|
||||
|
|
@ -198,48 +208,156 @@ int main(int argc, char ** argv) {
|
|||
int i_dft = 0;
|
||||
int s_keep = 0;
|
||||
|
||||
llama_token token_id;
|
||||
std::string token_str;
|
||||
|
||||
// loop until we fail to accept a drafted token or we run out of drafted tokens
|
||||
while (true) {
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||
|
||||
if (!params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
if (id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
++n_predict;
|
||||
|
||||
// check if the target token matches any of the drafts
|
||||
// for stochastic sampling, attempt to match the token with the drafted tokens
|
||||
{
|
||||
bool matches = false;
|
||||
bool accept = false;
|
||||
if (params.sparams.temp > 0) {
|
||||
// stochastic verification
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
float p_tgt = 0, p_dft = 0;
|
||||
|
||||
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
|
||||
|
||||
while (active_seqs.size() > 0) {
|
||||
// randomly select a sequence to verify from active sequences
|
||||
std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
|
||||
int s = *std::next(active_seqs.begin(), u_int_dist(rng));
|
||||
if (i_dft >= (int) drafts[s].tokens.size()) {
|
||||
drafts[s].active = false;
|
||||
active_seqs.erase(s);
|
||||
continue;
|
||||
}
|
||||
if (accept) {
|
||||
// if we already accepted a token, we can skip the rest
|
||||
if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
|
||||
drafts[s].active = false;
|
||||
active_seqs.erase(s);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||
float r = u_dist(rng);
|
||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
|
||||
// acquire the token probabilities assigned by the draft and target models
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
p_tgt = dist_tgt.data[i].p;
|
||||
}
|
||||
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
p_dft = dist_dft.data[i].p;
|
||||
}
|
||||
if (p_tgt && p_dft) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
|
||||
if (r <= p_tgt / p_dft) {
|
||||
s_keep = s;
|
||||
accept = true;
|
||||
token_id = drafts[s].tokens[i_dft];
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
|
||||
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
break;
|
||||
} else {
|
||||
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
||||
drafts[s].active = false;
|
||||
|
||||
// calculate residual probability
|
||||
GGML_ASSERT(dist_tgt.sorted);
|
||||
GGML_ASSERT(dist_dft.sorted);
|
||||
float sum_probs = 0.0f;
|
||||
|
||||
// sort dist by id
|
||||
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
return a.id < b.id;
|
||||
});
|
||||
std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
return a.id < b.id;
|
||||
});
|
||||
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
|
||||
sum_probs += dist_tgt.data[i].p;
|
||||
}
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
dist_tgt.data[i].p /= sum_probs;
|
||||
}
|
||||
|
||||
// sort dist_tgt by p desc
|
||||
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
return a.p > b.p;
|
||||
});
|
||||
}
|
||||
|
||||
active_seqs.erase(s);
|
||||
for(int i = 0; i < n_seq_dft; i++) {
|
||||
if (i == s) {
|
||||
continue;
|
||||
}
|
||||
if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
|
||||
// synchronize active status for sequences with the same drafted token
|
||||
drafts[i].active = drafts[i].active && accept;
|
||||
if (!drafts[i].active) {
|
||||
active_seqs.erase(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
|
||||
if (!accept) {
|
||||
// all drafted tokens were rejected
|
||||
// sample from the target model
|
||||
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
}
|
||||
|
||||
s_keep = s;
|
||||
matches = true;
|
||||
} else {
|
||||
drafts[s].active = false;
|
||||
} else {
|
||||
// greedy verification
|
||||
|
||||
// sample from the target model
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
|
||||
s_keep = s;
|
||||
accept = true;
|
||||
} else {
|
||||
drafts[s].active = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (matches) {
|
||||
if (token_id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
++n_predict;
|
||||
|
||||
if (accept) {
|
||||
++n_accept;
|
||||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
|
|
@ -247,17 +365,21 @@ int main(int argc, char ** argv) {
|
|||
if (params.use_color) {
|
||||
// Color token according to its origin sequence
|
||||
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||
fflush(stdout);
|
||||
} else {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
continue;
|
||||
} else {
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||
{
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
|
||||
|
||||
// TODO: simplify
|
||||
{
|
||||
|
|
@ -277,21 +399,21 @@ int main(int argc, char ** argv) {
|
|||
drafts[s].active = false;
|
||||
drafts[s].tokens.clear();
|
||||
drafts[s].i_batch_tgt.clear();
|
||||
drafts[s].dists.clear();
|
||||
}
|
||||
// note: will be erased after the speculation phase
|
||||
drafts[0].tokens.push_back(id);
|
||||
drafts[0].tokens.push_back(token_id);
|
||||
drafts[0].dists.push_back(std::vector<llama_token_data>());
|
||||
drafts[0].i_batch_tgt.push_back(0);
|
||||
|
||||
llama_batch_clear(batch_dft);
|
||||
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode (ctx_dft, batch_dft);
|
||||
llama_decode(ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_predict > params.n_predict || has_eos) {
|
||||
|
|
@ -336,12 +458,6 @@ int main(int argc, char ** argv) {
|
|||
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
|
||||
}
|
||||
|
||||
if (cur_p[0].p < p_accept) {
|
||||
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
|
||||
drafts[s].drafting = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<int> sa(1, s);
|
||||
|
||||
// attempt to split the branch if the probability is high enough
|
||||
|
|
@ -369,6 +485,7 @@ int main(int argc, char ** argv) {
|
|||
drafts[n_seq_cur].skip = true;
|
||||
|
||||
drafts[n_seq_cur].tokens = drafts[s].tokens;
|
||||
drafts[n_seq_cur].dists = drafts[s].dists;
|
||||
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
||||
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
||||
|
||||
|
|
@ -391,6 +508,8 @@ int main(int argc, char ** argv) {
|
|||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
// save cur_p.data into drafts[s].dists
|
||||
drafts[s].dists.push_back(cur_p);
|
||||
|
||||
// add unique drafted tokens to the target batch
|
||||
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||
|
|
@ -442,6 +561,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
drafts[s].tokens.erase(drafts[s].tokens.begin());
|
||||
drafts[s].dists.erase(drafts[s].dists.begin());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue