mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
better abort handling, added support for dynatemp exponent
This commit is contained in:
parent
f96f29be7b
commit
08236ccc97
7 changed files with 110 additions and 50 deletions
|
@ -89,6 +89,7 @@ struct gpt_params {
|
||||||
|
|
||||||
// DynaTemp!
|
// DynaTemp!
|
||||||
float dynatemp_range = 0.0f; // enables DynaTemp if greater than 0. dynatemp_min = temperature - dynatemp_range, dynatemp_max = temperature + dynatemp_range
|
float dynatemp_range = 0.0f; // enables DynaTemp if greater than 0. dynatemp_min = temperature - dynatemp_range, dynatemp_max = temperature + dynatemp_range
|
||||||
|
float dynatemp_exponent = 1.0f;
|
||||||
|
|
||||||
// // sampling parameters
|
// // sampling parameters
|
||||||
struct llama_sampling_params sparams;
|
struct llama_sampling_params sparams;
|
||||||
|
|
1
expose.h
1
expose.h
|
@ -82,6 +82,7 @@ struct generation_inputs
|
||||||
const bool grammar_retain_state;
|
const bool grammar_retain_state;
|
||||||
const bool quiet = false;
|
const bool quiet = false;
|
||||||
const float dynatemp_range = 0.0f;
|
const float dynatemp_range = 0.0f;
|
||||||
|
const float dynatemp_exponent = 1.0f;
|
||||||
const logit_bias logit_biases[logit_bias_max];
|
const logit_bias logit_biases[logit_bias_max];
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
|
@ -482,7 +482,7 @@ void sample_grammar(FileFormat file_format, int32_t n_vocab, llama_token_data_ar
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float presence_penalty, float top_k, float top_a, float top_p, float min_p, float typical_p, float tfs, float temp, std::mt19937 & rng,
|
int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float presence_penalty, float top_k, float top_a, float top_p, float min_p, float typical_p, float tfs, float temp, std::mt19937 & rng,
|
||||||
int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range)
|
int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range, float dynatemp_exponent)
|
||||||
{
|
{
|
||||||
int id = 0;
|
int id = 0;
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
|
@ -548,7 +548,8 @@ int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers
|
||||||
//do not allow negative values
|
//do not allow negative values
|
||||||
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
||||||
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
||||||
llama_sample_entropy(nullptr, &candidates_p, temp, dynatemp_min, dynatemp_max);
|
dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
|
||||||
|
llama_sample_entropy(nullptr, &candidates_p, temp, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1517,6 +1518,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
kcpp_params->mirostat_eta = inputs.mirostat_eta;
|
kcpp_params->mirostat_eta = inputs.mirostat_eta;
|
||||||
kcpp_params->mirostat_tau = inputs.mirostat_tau;
|
kcpp_params->mirostat_tau = inputs.mirostat_tau;
|
||||||
kcpp_params->dynatemp_range = inputs.dynatemp_range;
|
kcpp_params->dynatemp_range = inputs.dynatemp_range;
|
||||||
|
kcpp_params->dynatemp_exponent = inputs.dynatemp_exponent;
|
||||||
kcpp_params->n_ctx = inputs.max_context_length;
|
kcpp_params->n_ctx = inputs.max_context_length;
|
||||||
kcpp_params->n_batch = n_batch;
|
kcpp_params->n_batch = n_batch;
|
||||||
kcpp_params->n_threads = n_threads;
|
kcpp_params->n_threads = n_threads;
|
||||||
|
@ -1913,6 +1915,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
const float typical_p = kcpp_params->typical_p;
|
const float typical_p = kcpp_params->typical_p;
|
||||||
const float tfs_z = kcpp_params->tfs_z;
|
const float tfs_z = kcpp_params->tfs_z;
|
||||||
const float dynatemp_range = kcpp_params->dynatemp_range;
|
const float dynatemp_range = kcpp_params->dynatemp_range;
|
||||||
|
const float dynatemp_exponent = kcpp_params->dynatemp_exponent;
|
||||||
|
|
||||||
if (!startedsampling)
|
if (!startedsampling)
|
||||||
{
|
{
|
||||||
|
@ -1968,7 +1971,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
|
|
||||||
id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, presence_penalty,
|
id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, presence_penalty,
|
||||||
top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
|
top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
|
||||||
kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range);
|
kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range, dynatemp_exponent);
|
||||||
|
|
||||||
if (grammar != nullptr) {
|
if (grammar != nullptr) {
|
||||||
grammar_accept_token(file_format, n_vocab, grammar, id);
|
grammar_accept_token(file_format, n_vocab, grammar, id);
|
||||||
|
|
120
klite.embd
120
klite.embd
File diff suppressed because one or more lines are too long
22
koboldcpp.py
22
koboldcpp.py
|
@ -79,6 +79,7 @@ class generation_inputs(ctypes.Structure):
|
||||||
("grammar_retain_state", ctypes.c_bool),
|
("grammar_retain_state", ctypes.c_bool),
|
||||||
("quiet", ctypes.c_bool),
|
("quiet", ctypes.c_bool),
|
||||||
("dynatemp_range", ctypes.c_float),
|
("dynatemp_range", ctypes.c_float),
|
||||||
|
("dynatemp_exponent", ctypes.c_float),
|
||||||
("logit_biases", logit_bias * logit_bias_max)]
|
("logit_biases", logit_bias * logit_bias_max)]
|
||||||
|
|
||||||
class generation_outputs(ctypes.Structure):
|
class generation_outputs(ctypes.Structure):
|
||||||
|
@ -311,7 +312,7 @@ def load_model(model_filename):
|
||||||
ret = handle.load_model(inputs)
|
ret = handle.load_model(inputs)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, logit_biases={}):
|
def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, logit_biases={}):
|
||||||
global maxctx, args, currentusergenkey, totalgens
|
global maxctx, args, currentusergenkey, totalgens
|
||||||
inputs = generation_inputs()
|
inputs = generation_inputs()
|
||||||
outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
|
outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
|
||||||
|
@ -340,6 +341,7 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
|
||||||
inputs.stream_sse = stream_sse
|
inputs.stream_sse = stream_sse
|
||||||
inputs.quiet = quiet
|
inputs.quiet = quiet
|
||||||
inputs.dynatemp_range = dynatemp_range
|
inputs.dynatemp_range = dynatemp_range
|
||||||
|
inputs.dynatemp_exponent = dynatemp_exponent
|
||||||
inputs.grammar = grammar.encode("UTF-8")
|
inputs.grammar = grammar.encode("UTF-8")
|
||||||
inputs.grammar_retain_state = grammar_retain_state
|
inputs.grammar_retain_state = grammar_retain_state
|
||||||
inputs.unban_tokens_rt = not use_default_badwordsids
|
inputs.unban_tokens_rt = not use_default_badwordsids
|
||||||
|
@ -558,6 +560,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
trimstop=genparams.get('trim_stop', False),
|
trimstop=genparams.get('trim_stop', False),
|
||||||
quiet=is_quiet,
|
quiet=is_quiet,
|
||||||
dynatemp_range=genparams.get('dynatemp_range', 0.0),
|
dynatemp_range=genparams.get('dynatemp_range', 0.0),
|
||||||
|
dynatemp_exponent=genparams.get('dynatemp_exponent', 1.0),
|
||||||
logit_biases=genparams.get('logit_bias', {})
|
logit_biases=genparams.get('logit_bias', {})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -652,8 +655,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
await self.send_oai_sse_event('[DONE]')
|
await self.send_oai_sse_event('[DONE]')
|
||||||
break
|
break
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
print("SSE streaming was interrupted due to an exception")
|
print("Token streaming was interrupted or aborted!")
|
||||||
print(ex)
|
print(ex)
|
||||||
|
handle.abort_generate()
|
||||||
|
time.sleep(0.2) #short delay
|
||||||
|
|
||||||
# flush buffers, sleep a bit to make sure all data is sent, and then force close the connection
|
# flush buffers, sleep a bit to make sure all data is sent, and then force close the connection
|
||||||
self.wfile.flush()
|
self.wfile.flush()
|
||||||
|
@ -665,17 +670,18 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
async def handle_request(self, genparams, api_format, stream_flag):
|
async def handle_request(self, genparams, api_format, stream_flag):
|
||||||
tasks = []
|
tasks = []
|
||||||
|
|
||||||
if stream_flag:
|
|
||||||
tasks.append(self.handle_sse_stream(api_format))
|
|
||||||
|
|
||||||
generate_task = asyncio.create_task(self.generate_text(genparams, api_format, stream_flag))
|
|
||||||
tasks.append(generate_task)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
if stream_flag:
|
||||||
|
tasks.append(self.handle_sse_stream(api_format))
|
||||||
|
|
||||||
|
generate_task = asyncio.create_task(self.generate_text(genparams, api_format, stream_flag))
|
||||||
|
tasks.append(generate_task)
|
||||||
|
|
||||||
await asyncio.gather(*tasks)
|
await asyncio.gather(*tasks)
|
||||||
generate_result = generate_task.result()
|
generate_result = generate_task.result()
|
||||||
return generate_result
|
return generate_result
|
||||||
except (BrokenPipeError, ConnectionAbortedError) as cae: # attempt to abort if connection lost
|
except (BrokenPipeError, ConnectionAbortedError) as cae: # attempt to abort if connection lost
|
||||||
|
print("An ongoing connection was aborted or interrupted!")
|
||||||
print(cae)
|
print(cae)
|
||||||
handle.abort_generate()
|
handle.abort_generate()
|
||||||
time.sleep(0.2) #short delay
|
time.sleep(0.2) #short delay
|
||||||
|
|
|
@ -8479,12 +8479,12 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
|
||||||
llama_sample_temp(ctx, candidates_p, temp);
|
llama_sample_temp(ctx, candidates_p, temp);
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float min_temp = 0, float max_temp = 2.0f) {
|
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float min_temp = 0, float max_temp = 2.0f, float dynatemp_exponent = 1.0f) {
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
llama_sample_softmax(ctx, candidates_p);
|
llama_sample_softmax(ctx, candidates_p);
|
||||||
|
|
||||||
float exponent_val = 1.0f;
|
float exponent_val = dynatemp_exponent;
|
||||||
|
|
||||||
// Calculate entropy of the softmax probabilities
|
// Calculate entropy of the softmax probabilities
|
||||||
float entropy = 0.0f;
|
float entropy = 0.0f;
|
||||||
|
|
3
llama.h
3
llama.h
|
@ -771,7 +771,8 @@ extern "C" {
|
||||||
float p,
|
float p,
|
||||||
size_t min_keep,
|
size_t min_keep,
|
||||||
float min_temp,
|
float min_temp,
|
||||||
float max_temp);
|
float max_temp,
|
||||||
|
float dynatemp_exponent);
|
||||||
|
|
||||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||||
LLAMA_API void llama_sample_tail_free(
|
LLAMA_API void llama_sample_tail_free(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue