mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # examples/batched/batched.cpp # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # src/llama-context.cpp # tools/cli/README.md # tools/completion/README.md # tools/server/README.md
This commit is contained in:
commit
0d43bdc46d
26 changed files with 906 additions and 421 deletions
|
|
@ -1731,6 +1731,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--adaptive-target"}, "N",
|
||||
string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
|
||||
"to 1.0; negative = disabled) (default: %.2f)\n"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
|
||||
(double)params.sampling.adaptive_target),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.adaptive_target = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--adaptive-decay"}, "N",
|
||||
string_format("adaptive-p: decay rate for target adaptation over time. lower values "
|
||||
"are more reactive, higher values are more stable.\n"
|
||||
"(valid range 0.0 to 0.99) (default: %.2f)",
|
||||
(double)params.sampling.adaptive_decay),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.adaptive_decay = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dynatemp-range"}, "N",
|
||||
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
|
||||
|
|
|
|||
|
|
@ -1180,7 +1180,6 @@ common_init_result::common_init_result(common_params & params) :
|
|||
pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
|
||||
}
|
||||
|
||||
// TODO: temporarily gated behind a flag
|
||||
if (params.sampling.backend_sampling) {
|
||||
cparams.samplers = pimpl->samplers_seq_config.data();
|
||||
cparams.n_samplers = pimpl->samplers_seq_config.size();
|
||||
|
|
|
|||
|
|
@ -115,6 +115,7 @@ enum common_sampler_type {
|
|||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
|
|
@ -162,32 +163,34 @@ enum common_params_sampling_config : uint64_t {
|
|||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
||||
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
||||
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
||||
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float top_n_sigma = -1.00f;// -1.0 = disabled
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
||||
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
||||
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
||||
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
||||
float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
||||
float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float top_n_sigma = -1.00f; // -1.0 = disabled
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
|
||||
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
||||
|
|
|
|||
|
|
@ -167,11 +167,11 @@ std::string common_params_sampling::print() const {
|
|||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
|
||||
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f",
|
||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
|
||||
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
|
||||
mirostat, mirostat_eta, mirostat_tau);
|
||||
mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay);
|
||||
|
||||
return std::string(result);
|
||||
}
|
||||
|
|
@ -255,6 +255,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
|||
}
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
|
||||
bool use_adaptive_p = false; // see below
|
||||
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
|
|
@ -264,43 +267,54 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
|||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
samplers.push_back(llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
samplers.push_back(llama_sampler_init_top_k (params.top_k));
|
||||
samplers.push_back(llama_sampler_init_top_k(params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
samplers.push_back(llama_sampler_init_top_p(params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
samplers.push_back(llama_sampler_init_min_p(params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
samplers.push_back(llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
samplers.push_back(llama_sampler_init_typical(params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
samplers.push_back(llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
samplers.push_back(llama_sampler_init_infill (vocab));
|
||||
samplers.push_back(llama_sampler_init_infill(vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
|
||||
// the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
|
||||
// a single token, so we will add `dist` at the end of the chain by default,
|
||||
// unless the user specifically included `adaptive-p`. we set this flag here
|
||||
// so we know to add the sampler at the very end.
|
||||
use_adaptive_p = true;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
|
||||
samplers.push_back(llama_sampler_init_dist(params.seed));
|
||||
if (use_adaptive_p) {
|
||||
// only if user explicitly included adaptive-p sampler
|
||||
samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
|
||||
} else {
|
||||
// default: sample from distribution
|
||||
samplers.push_back(llama_sampler_init_dist(params.seed));
|
||||
}
|
||||
} else if (params.mirostat == 1) {
|
||||
samplers.push_back(llama_sampler_init_temp(params.temp));
|
||||
samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
|
|
@ -334,15 +348,21 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
|||
}
|
||||
|
||||
void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
if (gsmpl) {
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
if (!gsmpl) {
|
||||
return;
|
||||
}
|
||||
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
if (!gsmpl) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
if (gsmpl->grmr && accept_grammar) {
|
||||
|
|
@ -355,6 +375,10 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
|||
}
|
||||
|
||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
if (!gsmpl) {
|
||||
return;
|
||||
}
|
||||
|
||||
gsmpl->reset();
|
||||
}
|
||||
|
||||
|
|
@ -415,6 +439,10 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
|||
}
|
||||
|
||||
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
|
||||
if (!gsmpl) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return gsmpl->chain;
|
||||
}
|
||||
|
||||
|
|
@ -611,6 +639,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
|||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
|
||||
case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return 'a';
|
||||
default : return '?';
|
||||
}
|
||||
}
|
||||
|
|
@ -627,6 +656,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
|||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
|
||||
case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return "adaptive_p";
|
||||
default : return "";
|
||||
}
|
||||
}
|
||||
|
|
@ -643,6 +673,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
|||
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
||||
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
||||
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
|
||||
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
|
|
@ -658,6 +689,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
|||
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
|
||||
};
|
||||
|
||||
std::vector<common_sampler_type> samplers;
|
||||
|
|
@ -694,6 +726,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
|||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_ADAPTIVE_P), COMMON_SAMPLER_TYPE_ADAPTIVE_P },
|
||||
};
|
||||
|
||||
std::vector<common_sampler_type> samplers;
|
||||
|
|
|
|||
|
|
@ -654,6 +654,14 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|||
vec_extract(x[0], 2) + \
|
||||
vec_extract(x[0], 3); \
|
||||
}
|
||||
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
|
||||
{ \
|
||||
vector float v = vec_add(vec_add(s0, s1), \
|
||||
vec_add(s2, s3)); \
|
||||
v = vec_add(v, vec_sld(v, v, 8)); \
|
||||
v = vec_add(v, vec_sld(v, v, 4)); \
|
||||
res += (ggml_float) vec_extract(v, 0); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||
|
|
@ -690,6 +698,29 @@ static inline unsigned char ggml_endian_byte(int i) {
|
|||
r[i - GGML_ENDIAN_BYTE(0)]), \
|
||||
0, p - GGML_F16_EPR)
|
||||
|
||||
//BF16 POWER9
|
||||
#define GGML_BF16_STEP 16
|
||||
#define GGML_BF16_EPR 8
|
||||
|
||||
#define GGML_BF16x8 vector unsigned short
|
||||
#define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
|
||||
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
|
||||
|
||||
#define GGML_BF16_VEC GGML_BF16x8
|
||||
#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
|
||||
#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
|
||||
#if defined(__LITTLE_ENDIAN__)
|
||||
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
|
||||
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
|
||||
#else
|
||||
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
|
||||
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
|
||||
#endif
|
||||
#define GGML_BF16_FMA_LO(acc, x, y) \
|
||||
(acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
|
||||
#define GGML_BF16_FMA_HI(acc, x, y) \
|
||||
(acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
|
||||
|
||||
#elif defined(__wasm_simd128__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
|
|
|||
|
|
@ -237,6 +237,24 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
|
|||
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
|
||||
|
||||
#endif
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
const int np = (n & ~(GGML_BF16_STEP - 1));
|
||||
if (np > 0) {
|
||||
GGML_F32_VEC sum[4] = {GGML_F32_VEC_ZERO};
|
||||
for (; i < np; i += GGML_BF16_STEP) {
|
||||
GGML_BF16_VEC vx0 = GGML_BF16_VEC_LOAD(x + i);
|
||||
GGML_BF16_VEC vx1 = GGML_BF16_VEC_LOAD(x + i + 8);
|
||||
GGML_BF16_VEC vy0 = GGML_BF16_VEC_LOAD(y + i);
|
||||
GGML_BF16_VEC vy1 = GGML_BF16_VEC_LOAD(y + i + 8);
|
||||
GGML_BF16_FMA_LO(sum[0], vx0, vy0);
|
||||
GGML_BF16_FMA_HI(sum[1], vx0, vy0);
|
||||
GGML_BF16_FMA_LO(sum[2], vx1, vy1);
|
||||
GGML_BF16_FMA_HI(sum[3], vx1, vy1);
|
||||
}
|
||||
GGML_F32x4_REDUCE_4(sumf, sum[0], sum[1], sum[2], sum[3]);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < n; ++i) {
|
||||
sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
|
||||
GGML_BF16_TO_FP32(y[i]));
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
|
|||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
|
||||
half2 tmp[cpy_ne];
|
||||
__align__(16) half2 tmp[cpy_ne];
|
||||
ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_h2 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
|
||||
|
|
@ -309,7 +309,7 @@ static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict_
|
|||
ggml_cuda_memcpy_1<ne*sizeof(half)>(dst, (const half *) vx + i0);
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
static_assert(ne % 2 == 0, "bad ne");
|
||||
half2 tmp[ne/2];
|
||||
__align__(16) half2 tmp[ne/2];
|
||||
ggml_cuda_memcpy_1<ne*sizeof(half)>(tmp, (const half *) vx + i0);
|
||||
float2 * dst_f2 = (float2 *) dst;
|
||||
#pragma unroll
|
||||
|
|
|
|||
|
|
@ -343,7 +343,7 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|||
for (int j0 = j0_start; j0 < j0_stop; j0 += stride_j) {
|
||||
const int j = j0*cpy_ne + (stride_j == warp_size ? threadIdx.x : threadIdx.x % stride_j)*cpy_ne;
|
||||
|
||||
const half2 zero[cpy_ne] = {{0.0f, 0.0f}};
|
||||
const __align__(16) half2 zero[cpy_ne] = {{0.0f, 0.0f}};
|
||||
ggml_cuda_memcpy_1<cpy_nb>(
|
||||
tile_KV + i*(J/2 + J_padding) + j,
|
||||
!oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
|
||||
|
|
@ -394,11 +394,11 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|||
const int j = j0*(cpy_ne/2) + (stride_j == warp_size ? threadIdx.x : threadIdx.x % stride_j)*(cpy_ne/2);
|
||||
|
||||
const half2 zero[cpy_ne/2] = {{0.0f, 0.0f}};
|
||||
half2 tmp_h2[cpy_ne/2];
|
||||
__align__(16) half2 tmp_h2[cpy_ne/2];
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
||||
tmp_h2, !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
|
||||
|
||||
float2 tmp_f2[cpy_ne/2];
|
||||
__align__(16) float2 tmp_f2[cpy_ne/2];
|
||||
#pragma unroll
|
||||
for (int l = 0; l < cpy_ne/2; ++l) {
|
||||
tmp_f2[l] = __half22float2(tmp_h2[l]);
|
||||
|
|
@ -445,14 +445,14 @@ static __device__ __forceinline__ void flash_attn_tile_iter_KQ(
|
|||
static_assert((nbatch_K/2) % cpy_ne == 0, "bad nbatch_K");
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < nbatch_K/2; k_KQ_1 += cpy_ne) {
|
||||
half2 K_k[nbatch_fa/(np*warp_size)][cpy_ne];
|
||||
half2 Q_k[cpw][cpy_ne];
|
||||
__align__(16) half2 K_k[nbatch_fa/(np*warp_size)][cpy_ne];
|
||||
__align__(16) half2 Q_k[cpw][cpy_ne];
|
||||
#else
|
||||
static_assert(nbatch_K % cpy_ne == 0, "bad nbatch_K");
|
||||
#pragma unroll
|
||||
for (int k_KQ_1 = 0; k_KQ_1 < nbatch_K; k_KQ_1 += cpy_ne) {
|
||||
float K_k[nbatch_fa/(np*warp_size)][cpy_ne];
|
||||
float Q_k[cpw][cpy_ne];
|
||||
__align__(16) float K_k[nbatch_fa/(np*warp_size)][cpy_ne];
|
||||
__align__(16) float Q_k[cpw][cpy_ne];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
|
|
@ -602,9 +602,9 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|||
#pragma unroll
|
||||
for (int jc0 = 0; jc0 < cpw; jc0 += KQ_cs) {
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half tmp[nbatch_fa/(np*warp_size)][KQ_cs];
|
||||
__align__(16) half tmp[nbatch_fa/(np*warp_size)][KQ_cs];
|
||||
#else
|
||||
float tmp[nbatch_fa/(np*warp_size)][KQ_cs];
|
||||
__align__(16) float tmp[nbatch_fa/(np*warp_size)][KQ_cs];
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
#pragma unroll
|
||||
|
|
@ -664,8 +664,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|||
#ifdef FAST_FP16_AVAILABLE
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < nbatch_V; k1 += np) {
|
||||
half2 V_k[(DVp/2)/warp_size];
|
||||
half2 KQ_k[cpw];
|
||||
__align__(16) half2 V_k[(DVp/2)/warp_size];
|
||||
__align__(16) half2 KQ_k[cpw];
|
||||
|
||||
constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;
|
||||
#pragma unroll
|
||||
|
|
@ -676,7 +676,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|||
for (int jc_VKQ_0 = 0; jc_VKQ_0 < cpw; jc_VKQ_0 += KQ_cs) {
|
||||
const int jc_KQ = jc_VKQ_0/KQ_cs + (threadIdx.y / np)*(cpw/KQ_cs);
|
||||
|
||||
half tmp[KQ_cs];
|
||||
__align__(16) half tmp[KQ_cs];
|
||||
ggml_cuda_memcpy_1<KQ_cs*sizeof(half)>(
|
||||
&tmp, KQ + jc_KQ*(nbatch_fa*KQ_cs) + (k0 + k1 + threadIdx.y % np)*KQ_cs);
|
||||
#pragma unroll
|
||||
|
|
@ -696,8 +696,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|||
#else
|
||||
#pragma unroll
|
||||
for (int k1 = 0; k1 < nbatch_V; k1 += np) {
|
||||
float2 V_k[(DVp/2)/warp_size];
|
||||
float KQ_k[cpw];
|
||||
__align__(16) float2 V_k[(DVp/2)/warp_size];
|
||||
__align__(16) float KQ_k[cpw];
|
||||
|
||||
constexpr int cpy_ne_D = cpy_ne < DVp/warp_size ? cpy_ne : DVp/warp_size;
|
||||
#pragma unroll
|
||||
|
|
@ -821,12 +821,12 @@ static __global__ void flash_attn_tile(
|
|||
__shared__ half2 Q_tmp[ncols * DKQ/2];
|
||||
__shared__ half2 KV_tmp[nbatch_fa * (nbatch_K/2 + cpy_ne) + DVp-DV];
|
||||
__shared__ half KQ[ncols * nbatch_fa];
|
||||
half2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
|
||||
__align__(16) half2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
|
||||
#else
|
||||
__shared__ float Q_tmp[ncols * DKQ];
|
||||
__shared__ float KV_tmp[nbatch_fa * (nbatch_K + cpy_ne) + DVp-DV];
|
||||
__shared__ float KQ[ncols * nbatch_fa];
|
||||
float2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
|
||||
__align__(16) float2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
|
||||
#endif // FAST_FP16_AVAILABLE
|
||||
|
||||
float KQ_max[cpw];
|
||||
|
|
@ -849,7 +849,7 @@ static __global__ void flash_attn_tile(
|
|||
#pragma unroll
|
||||
for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
|
||||
if (i0 + np*warp_size*cpy_ne_D <= DKQ || i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D < DKQ) {
|
||||
float tmp_f[cpy_ne_D] = {0.0f};
|
||||
__align__(16) float tmp_f[cpy_ne_D] = {0.0f};
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_f)>
|
||||
(tmp_f, &Q_f[c*(nb02/sizeof(float)) + fastmodulo(col_Q_0 + j, ne01)*(nb01/sizeof(float))
|
||||
+ i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
|
||||
|
|
@ -860,7 +860,7 @@ static __global__ void flash_attn_tile(
|
|||
}
|
||||
|
||||
#ifdef FAST_FP16_AVAILABLE
|
||||
half2 tmp_h2[cpy_ne_D/2];
|
||||
__align__(16) half2 tmp_h2[cpy_ne_D/2];
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
|
||||
tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
|
||||
|
|
@ -959,7 +959,7 @@ static __global__ void flash_attn_tile(
|
|||
constexpr int cpy_ne_D = cpy_ne < (DVp/2)/warp_size ? cpy_ne : (DVp/2)/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < DVp/2; i0 += warp_size*cpy_ne_D) {
|
||||
half2 tmp[cpy_ne_D];
|
||||
__align__(16) half2 tmp[cpy_ne_D];
|
||||
ggml_cuda_memcpy_1<cpy_ne_D*4>(tmp, &VKQ_combine[(threadIdx.y + ip)*(DVp/2) + i0 + threadIdx.x*cpy_ne_D]);
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
||||
|
|
@ -970,7 +970,7 @@ static __global__ void flash_attn_tile(
|
|||
constexpr int cpy_ne_D = cpy_ne < DVp/warp_size ? cpy_ne : DVp/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < DVp; i0 += warp_size*cpy_ne_D) {
|
||||
float tmp[cpy_ne_D];
|
||||
__align__(16) float tmp[cpy_ne_D];
|
||||
ggml_cuda_memcpy_1<cpy_ne_D*4>(tmp, &VKQ_combine[(threadIdx.y + ip)*DVp + i0 + threadIdx.x*cpy_ne_D]);
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
||||
|
|
@ -1033,7 +1033,7 @@ static __global__ void flash_attn_tile(
|
|||
constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < DVp/2; i0 += warp_size*cpy_ne_D) {
|
||||
float2 tmp[cpy_ne_D];
|
||||
__align__(16) float2 tmp[cpy_ne_D];
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
||||
tmp[i1] = __half22float2(VKQ[jc0*((DVp/2)/warp_size) + i0/warp_size + i1]);
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ static __global__ void flash_attn_ext_vec(
|
|||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
||||
half2 Q_reg[ncols][(D/2)/nthreads_KQ]; // Will be initialized completely.
|
||||
#else
|
||||
float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
|
||||
__align__(16) float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
|
||||
#endif // V_DOT2_F32_F16_AVAILABLE
|
||||
int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
|
||||
float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
|
||||
|
|
@ -200,7 +200,7 @@ static __global__ void flash_attn_ext_vec(
|
|||
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
|
||||
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
|
||||
|
||||
float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
|
||||
__align__(16) float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
|
||||
if (ncols == 1 || ic0 + j < int(ne01.z)) {
|
||||
ggml_cuda_memcpy_1<cpy_nb>(tmp, &Q_j[i]);
|
||||
ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
|
||||
|
|
|
|||
|
|
@ -3742,8 +3742,10 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
|
|||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
|
||||
if (!cuda_ctx->cuda_graph->disable_due_to_gpu_arch) {
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||
}
|
||||
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -21,7 +21,9 @@ struct llama_sampler_deleter {
|
|||
};
|
||||
|
||||
struct llama_adapter_lora_deleter {
|
||||
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
|
||||
void operator()(llama_adapter_lora *) {
|
||||
// llama_adapter_lora_free is deprecated
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
||||
|
|
|
|||
|
|
@ -649,7 +649,8 @@ extern "C" {
|
|||
|
||||
// Manually free a LoRA adapter
|
||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
|
||||
"adapters are now freed together with the associated model");
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
||||
|
|
@ -1258,7 +1259,6 @@ extern "C" {
|
|||
// [EXPERIMENTAL]
|
||||
// attach a sampler to the context
|
||||
// note: prefer initializing the context with llama_context_params.samplers when possible
|
||||
// note: changing the samplers of a context can cause graph reallocations and degraded performance
|
||||
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
|
||||
|
||||
// mirror of llama_sampler_i:
|
||||
|
|
@ -1398,6 +1398,33 @@ extern "C" {
|
|||
const char ** seq_breakers,
|
||||
size_t num_breakers);
|
||||
|
||||
/// adaptive-p: select tokens near a configurable target probability over time.
|
||||
///
|
||||
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
|
||||
/// that fall near a user-configurable probability target.
|
||||
///
|
||||
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
|
||||
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
|
||||
/// adapted target probability at each sampling step, thus maintaining the desired target
|
||||
/// probability over time.
|
||||
///
|
||||
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
|
||||
/// in the sampler chain (like mirostat, dist, greedy).
|
||||
///
|
||||
/// only mild truncation before this sampler is recommended. we suggest applying min-p
|
||||
/// before adaptive-p as the only other active sampler in the chain.
|
||||
///
|
||||
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
||||
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
|
||||
/// @param seed RNG seed
|
||||
///
|
||||
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
||||
///
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
|
||||
float target,
|
||||
float decay,
|
||||
uint32_t seed);
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
int32_t n_vocab,
|
||||
int32_t n_logit_bias,
|
||||
|
|
|
|||
|
|
@ -146,11 +146,9 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
|
||||
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
|
||||
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
|
||||
|
||||
llama_model & model = adapter.model;
|
||||
|
||||
ggml_context * ctx_init;
|
||||
gguf_init_params meta_gguf_params = {
|
||||
/* .no_alloc = */ true,
|
||||
|
|
@ -413,17 +411,17 @@ static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_l
|
|||
}
|
||||
}
|
||||
|
||||
// update number of nodes used
|
||||
model.n_lora_nodes += adapter.get_n_nodes();
|
||||
// register adapter with model
|
||||
model.loras.insert(&adapter);
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
|
||||
}
|
||||
|
||||
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
|
||||
llama_adapter_lora * adapter = new llama_adapter_lora(*model);
|
||||
llama_adapter_lora * adapter = new llama_adapter_lora();
|
||||
|
||||
try {
|
||||
llama_adapter_lora_init_impl(path_lora, *adapter);
|
||||
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
||||
return adapter;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
|
|
@ -473,12 +471,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
|
|||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||
}
|
||||
|
||||
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
|
||||
// update number of nodes used
|
||||
GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
|
||||
adapter->model.n_lora_nodes -= adapter->get_n_nodes();
|
||||
|
||||
delete adapter;
|
||||
void llama_adapter_lora_free(llama_adapter_lora *) {
|
||||
// deprecated: adapters are freed by llama_model's destructor
|
||||
}
|
||||
|
||||
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
|
||||
|
|
|
|||
|
|
@ -59,8 +59,6 @@ struct llama_adapter_lora_weight {
|
|||
};
|
||||
|
||||
struct llama_adapter_lora {
|
||||
llama_model & model;
|
||||
|
||||
// map tensor name to lora_a_b
|
||||
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
|
||||
|
||||
|
|
@ -75,7 +73,7 @@ struct llama_adapter_lora {
|
|||
// activated lora (aLoRA)
|
||||
std::vector<llama_token> alora_invocation_tokens;
|
||||
|
||||
llama_adapter_lora(llama_model & model) : model(model) {}
|
||||
llama_adapter_lora() = default;
|
||||
~llama_adapter_lora() = default;
|
||||
|
||||
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
|
||||
|
|
|
|||
|
|
@ -149,6 +149,7 @@ llama_context::llama_context(
|
|||
}
|
||||
|
||||
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
|
||||
// with causal attention, the batch size is limited by the context size
|
||||
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
||||
|
|
@ -158,6 +159,9 @@ llama_context::llama_context(
|
|||
cparams.op_offload = params.op_offload;
|
||||
cparams.kv_unified = params.kv_unified;
|
||||
|
||||
// intialized later
|
||||
cparams.pipeline_parallel = false;
|
||||
|
||||
{
|
||||
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
|
||||
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
|
||||
|
|
@ -305,16 +309,6 @@ llama_context::llama_context(
|
|||
|
||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
gf_res_prev.reset(new llm_graph_result(max_nodes));
|
||||
gf_res_reserve.reset(new llm_graph_result(max_nodes));
|
||||
|
||||
// TODO: move these checks to ggml_backend_sched
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel =
|
||||
|
|
@ -348,144 +342,19 @@ llama_context::llama_context(
|
|||
}
|
||||
}
|
||||
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));
|
||||
cparams.pipeline_parallel = pipeline_parallel;
|
||||
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
|
||||
if (cparams.pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
||||
}
|
||||
|
||||
if (memory) {
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
|
||||
mctx = memory->init_full();
|
||||
if (!mctx) {
|
||||
throw std::runtime_error("failed to initialize memory module");
|
||||
sched_reserve();
|
||||
|
||||
if (!cparams.flash_attn) {
|
||||
if (ggml_is_quantized(params.type_v)) {
|
||||
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
|
||||
}
|
||||
}
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
n_outputs = n_seqs;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
// resolve automatic Flash Attention use
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
|
||||
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to split graph for Flash Attention check");
|
||||
}
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
|
||||
bool fa_device_mismatch = false;
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
||||
ggml_tensor * n = ggml_graph_node(gf, i);
|
||||
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
||||
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
|
||||
const int il = std::stoi(n->name + prefix_len);
|
||||
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
||||
if (device_fa != device_kv) {
|
||||
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
|
||||
"is assigned to device %s (usually due to missing support)\n",
|
||||
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
|
||||
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
|
||||
fa_device_mismatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (fa_device_mismatch) {
|
||||
cparams.flash_attn = false;
|
||||
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
||||
if (ggml_is_quantized(params.type_v)) {
|
||||
throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
|
||||
}
|
||||
} else {
|
||||
cparams.flash_attn = true;
|
||||
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// reserve worst-case graph
|
||||
int n_splits_pp = -1;
|
||||
int n_nodes_pp = -1;
|
||||
|
||||
int n_splits_tg = -1;
|
||||
int n_nodes_tg = -1;
|
||||
|
||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
|
||||
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
|
||||
if (!gf) {
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
|
||||
gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
}
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
}
|
||||
|
||||
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
|
||||
n_nodes_pp = ggml_graph_n_nodes(gf);
|
||||
}
|
||||
|
||||
// reserve with tg (token generation) graph to get the number of splits and nodes
|
||||
{
|
||||
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute tg buffers");
|
||||
}
|
||||
|
||||
n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
|
||||
n_nodes_tg = ggml_graph_n_nodes(gf);
|
||||
}
|
||||
|
||||
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
||||
{
|
||||
// TODO: not sure if the following graph would be worster case for multi-stream KV caches:
|
||||
//
|
||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||
//
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
if (!model.hparams.no_alloc) {
|
||||
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
}
|
||||
if (backend_buf_exp_size[i] > 1) {
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_nodes_pp == n_nodes_tg) {
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
|
||||
}
|
||||
|
||||
if (n_splits_pp == n_splits_tg) {
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the full vocabulary token ids for backend samplers.
|
||||
|
|
@ -497,7 +366,6 @@ llama_context::llama_context(
|
|||
sampling.token_ids_full_vocab[i] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llama_context::~llama_context() {
|
||||
|
|
@ -520,7 +388,172 @@ llama_context::~llama_context() {
|
|||
ggml_opt_free(opt_ctx);
|
||||
}
|
||||
|
||||
void llama_context::sched_reserve() {
|
||||
if (!sched_need_reserve) {
|
||||
return;
|
||||
}
|
||||
|
||||
sched_need_reserve = false;
|
||||
|
||||
LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
|
||||
|
||||
synchronize();
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
gf_res_prev.reset(new llm_graph_result(max_nodes));
|
||||
gf_res_reserve.reset(new llm_graph_result(max_nodes));
|
||||
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
|
||||
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
|
||||
mctx = memory->init_full();
|
||||
if (!mctx) {
|
||||
throw std::runtime_error("failed to initialize memory module");
|
||||
}
|
||||
}
|
||||
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
const int n_outputs = n_seqs;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
// resolve automatic Flash Attention use
|
||||
if (cparams.auto_fa) {
|
||||
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to split graph for Flash Attention check");
|
||||
}
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
|
||||
bool fa_device_mismatch = false;
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
||||
ggml_tensor * n = ggml_graph_node(gf, i);
|
||||
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
||||
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
|
||||
const int il = std::stoi(n->name + prefix_len);
|
||||
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
||||
if (device_fa != device_kv) {
|
||||
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
|
||||
"is assigned to device %s (usually due to missing support)\n",
|
||||
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
|
||||
// FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
|
||||
fa_device_mismatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (fa_device_mismatch) {
|
||||
cparams.flash_attn = false;
|
||||
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
||||
} else {
|
||||
cparams.flash_attn = true;
|
||||
LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
|
||||
}
|
||||
|
||||
cparams.auto_fa = false;
|
||||
}
|
||||
|
||||
// reserve worst-case graph
|
||||
int n_splits_pp = -1;
|
||||
int n_nodes_pp = -1;
|
||||
|
||||
int n_splits_tg = -1;
|
||||
int n_nodes_tg = -1;
|
||||
|
||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
|
||||
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
|
||||
if (!gf) {
|
||||
if (cparams.pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
cparams.pipeline_parallel = false;
|
||||
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
|
||||
gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
}
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
}
|
||||
|
||||
n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
|
||||
n_nodes_pp = ggml_graph_n_nodes(gf);
|
||||
}
|
||||
|
||||
// reserve with tg (token generation) graph to get the number of splits and nodes
|
||||
{
|
||||
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute tg buffers");
|
||||
}
|
||||
|
||||
n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
|
||||
n_nodes_tg = ggml_graph_n_nodes(gf);
|
||||
}
|
||||
|
||||
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
||||
{
|
||||
// TODO: not sure if the following graph would be worster case for multi-stream KV caches:
|
||||
//
|
||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||
//
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
if (!model.hparams.no_alloc) {
|
||||
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
}
|
||||
if (backend_buf_exp_size[i] > 1) {
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_nodes_pp == n_nodes_tg) {
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
|
||||
}
|
||||
|
||||
if (n_splits_pp == n_splits_tg) {
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
|
||||
}
|
||||
|
||||
const int64_t t_end_us = ggml_time_us();
|
||||
|
||||
LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
|
||||
__func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
|
||||
}
|
||||
|
||||
void llama_context::synchronize() {
|
||||
if (!sched) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_backend_sched_synchronize(sched.get());
|
||||
|
||||
// FIXME: if multiple single tokens are evaluated without a synchronization,
|
||||
|
|
@ -961,21 +994,41 @@ void llama_context::set_embeddings(bool value) {
|
|||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
cparams.embeddings = value;
|
||||
|
||||
// TODO: not sure yet if we want to reserve here
|
||||
//sched_need_reserve = true;
|
||||
}
|
||||
|
||||
void llama_context::set_causal_attn(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
if (cparams.causal_attn == value) {
|
||||
return;
|
||||
}
|
||||
|
||||
cparams.causal_attn = value;
|
||||
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
|
||||
void llama_context::set_warmup(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
if (cparams.warmup == value) {
|
||||
return;
|
||||
}
|
||||
|
||||
cparams.warmup = value;
|
||||
|
||||
// warmups are usually with small batches, so no need to reserve
|
||||
//sched_need_reserve = true;
|
||||
}
|
||||
|
||||
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
||||
if (!sampler && sampling.samplers.count(seq_id) == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
|
||||
|
||||
const bool can_offload =
|
||||
|
|
@ -995,12 +1048,18 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|||
|
||||
sampling.samplers[seq_id] = sampler;
|
||||
|
||||
sched_need_reserve = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (sampler && !can_offload) {
|
||||
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
|
||||
|
||||
if (sampling.samplers.count(seq_id) > 0) {
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
|
||||
sampling.samplers.erase(seq_id);
|
||||
|
||||
return false;
|
||||
|
|
@ -1008,6 +1067,8 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|||
|
||||
sampling.samplers.erase(seq_id);
|
||||
|
||||
sched_need_reserve = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1016,16 +1077,27 @@ void llama_context::set_adapter_lora(
|
|||
float scale) {
|
||||
LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
|
||||
|
||||
if (auto it = loras.find(adapter); it != loras.end()) {
|
||||
if (it->second == scale) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
loras[adapter] = scale;
|
||||
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
|
||||
bool llama_context::rm_adapter_lora(
|
||||
llama_adapter_lora * adapter) {
|
||||
LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
|
||||
|
||||
auto pos = loras.find(adapter);
|
||||
if (pos != loras.end()) {
|
||||
loras.erase(pos);
|
||||
auto it = loras.find(adapter);
|
||||
if (it != loras.end()) {
|
||||
loras.erase(it);
|
||||
|
||||
sched_need_reserve = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1035,7 +1107,13 @@ bool llama_context::rm_adapter_lora(
|
|||
void llama_context::clear_adapter_lora() {
|
||||
LLAMA_LOG_DEBUG("%s: call\n", __func__);
|
||||
|
||||
if (loras.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
loras.clear();
|
||||
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
|
||||
bool llama_context::apply_adapter_cvec(
|
||||
|
|
@ -1046,6 +1124,8 @@ bool llama_context::apply_adapter_cvec(
|
|||
int32_t il_end) {
|
||||
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
|
||||
|
||||
// TODO: should we reserve?
|
||||
|
||||
return cvec.apply(model, data, len, n_embd, il_start, il_end);
|
||||
}
|
||||
|
||||
|
|
@ -1148,6 +1228,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
|
||||
sched_reserve();
|
||||
|
||||
n_queued_tokens += n_tokens;
|
||||
|
||||
// reserve output buffer
|
||||
|
|
@ -1187,7 +1269,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|||
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
|
||||
|
||||
// extract logits
|
||||
if (logits && t_logits) {
|
||||
if (logits && t_logits) {
|
||||
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
|
||||
GGML_ASSERT(backend_res != nullptr);
|
||||
GGML_ASSERT(logits != nullptr);
|
||||
|
|
@ -1461,6 +1543,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|||
embd_seq.clear();
|
||||
output_swaps.clear();
|
||||
|
||||
sched_reserve();
|
||||
|
||||
bool did_optimize = false;
|
||||
|
||||
// handle any pending shifts/copies
|
||||
|
|
@ -1965,7 +2049,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
|||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
}
|
||||
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
res += model.n_lora_nodes;
|
||||
for (const auto & lora : model.loras) {
|
||||
res += lora->get_n_nodes();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,14 @@ struct llama_context {
|
|||
|
||||
~llama_context();
|
||||
|
||||
// reserve a new backend scheduler (if needed)
|
||||
// for example, when:
|
||||
// - changing loras
|
||||
// - changing samplers
|
||||
// - changing attention type
|
||||
// - etc.
|
||||
void sched_reserve();
|
||||
|
||||
void synchronize();
|
||||
|
||||
const llama_model & get_model() const;
|
||||
|
|
@ -314,6 +322,8 @@ private:
|
|||
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
bool sched_need_reserve = true;
|
||||
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
std::vector<ggml_backend_ptr> backends;
|
||||
|
||||
|
|
|
|||
|
|
@ -30,10 +30,12 @@ struct llama_cparams {
|
|||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool auto_fa;
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
bool op_offload;
|
||||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
|
|
|
|||
|
|
@ -578,7 +578,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
|
|||
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
||||
}
|
||||
|
||||
llama_model::~llama_model() = default;
|
||||
llama_model::~llama_model() {
|
||||
for (auto * lora : loras) {
|
||||
delete lora;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model::load_stats(llama_model_loader & ml) {
|
||||
pimpl->n_elements = ml.n_elements;
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
struct llama_cparams;
|
||||
|
|
@ -476,8 +477,8 @@ struct llama_model {
|
|||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
// for keeping track of extra nodes used by lora adapters
|
||||
uint32_t n_lora_nodes = 0;
|
||||
// for keeping track of associated LoRA adapters
|
||||
std::unordered_set<llama_adapter_lora *> loras;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
|
|
|||
|
|
@ -1513,12 +1513,9 @@ static void llama_sampler_top_p_backend_apply(
|
|||
mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
|
||||
mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
|
||||
|
||||
// Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
|
||||
// top_p_bias = (mask * 1e9f) - 1e9f.
|
||||
// So entries in the mask that we want to discard will become -1e9f, and
|
||||
// others will be 0 (meaning that will not effect the logits).
|
||||
const float large_val = 1e9f;
|
||||
struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
||||
// Apply -INFINITY bias for masked-out tokens
|
||||
// log(1) = 0 (keep), log(0) = -INF (discard)
|
||||
struct ggml_tensor * top_p_bias = ggml_log(ctx, mask);
|
||||
ggml_set_name(top_p_bias, "top_p_bias");
|
||||
|
||||
data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
|
||||
|
|
@ -1673,15 +1670,11 @@ static void llama_sampler_min_p_backend_apply(
|
|||
struct ggml_tensor * mask = ggml_step(ctx, sub);
|
||||
ggml_set_name(mask, "min_p_mask");
|
||||
|
||||
// Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
|
||||
// min_p_bias = (mask * 1e9f) - 1e9f.
|
||||
// So entries in the mask that we want to discard will become -1e9f, and
|
||||
// others will be 0 (meaning that will not effect the logits).
|
||||
const float large_val = 1e9f;
|
||||
struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
||||
// Apply -INFINITY bias for masked-out tokens
|
||||
// log(1) = 0 (keep), log(0) = -INF (discard)
|
||||
struct ggml_tensor * min_p_bias = ggml_log(ctx, mask);
|
||||
ggml_set_name(min_p_bias, "min_p_bias");
|
||||
|
||||
// Add the min_p bias to the logits.
|
||||
data->logits = ggml_add(ctx, data->logits, min_p_bias);
|
||||
ggml_set_name(data->logits, "min_p_logits");
|
||||
|
||||
|
|
@ -3293,6 +3286,170 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
|
|||
return result;
|
||||
}
|
||||
|
||||
// adaptive-p sampler state
|
||||
//
|
||||
// maintains an exponential moving average of the *ORIGINAL* probabilities
|
||||
// of selected tokens, used to compute an adapted target at each sampling step.
|
||||
//
|
||||
// see llama.h for a full description of the sampler
|
||||
//
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
||||
//
|
||||
struct llama_sampler_adaptive_p {
|
||||
const float target; // target probability (0.0 - 1.0; negative = disabled)
|
||||
const float decay; // EMA decay; history ~= 1/(1-decay) tokens (0.0 - 0.99)
|
||||
const uint32_t seed; // original RNG seed
|
||||
uint32_t seed_cur; // actual RNG seed
|
||||
std::mt19937 rng; // RNG state
|
||||
float weighted_sum; // sum(p_i * decay^i)
|
||||
float total_weight; // sum(decay^i), converges to 1/(1-decay)
|
||||
std::vector<float> original_probs; // pre-transform probs, cached for EMA update
|
||||
llama_token pending_token_id; // token ID of selected token
|
||||
int32_t pending_token_idx; // index of orig. prob. of selected token in original_probs
|
||||
};
|
||||
|
||||
// adaptive probability transformation constants
|
||||
static constexpr float DISTRIBUTION_WIDTH = 0.3f;
|
||||
static constexpr float PEAK_LOGIT_VALUE = 5.0f;
|
||||
static constexpr float SHARPNESS = 10.0f;
|
||||
static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
|
||||
|
||||
static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
|
||||
return "adaptive-p";
|
||||
}
|
||||
|
||||
static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
||||
|
||||
llama_sampler_softmax_impl(cur_p, false);
|
||||
|
||||
if (ctx->target < 0.0f) {
|
||||
// at negative target values, adaptive-p is no-op
|
||||
// we simply sample from the existing distribution
|
||||
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
|
||||
return;
|
||||
}
|
||||
|
||||
// store the original probabilities
|
||||
ctx->original_probs.resize(cur_p->size);
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
ctx->original_probs[i] = cur_p->data[i].p;
|
||||
}
|
||||
|
||||
// using the EMA, compute the adapted target probability for the current sampling step
|
||||
auto target = std::clamp(ctx->target, 0.0f, 1.0f);
|
||||
float adapted_target = std::clamp(
|
||||
ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
|
||||
0.0f, 1.0f
|
||||
);
|
||||
|
||||
// adaptive probability transform
|
||||
//
|
||||
// quadratic near target for fine differentiation, transitioning to linear decay in the
|
||||
// tails. unbounded negative logits ensure proper suppression of far-from-target tokens
|
||||
// after the softmax.
|
||||
//
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].logit == -INFINITY) {
|
||||
// don't transform logits that are -INFINITY
|
||||
// (as masked out by e.g. min-p and top-p when using backend sampling)
|
||||
continue;
|
||||
}
|
||||
float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
|
||||
cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
|
||||
}
|
||||
|
||||
// softmax and sample from the transformed distribution
|
||||
llama_sampler_softmax_impl(cur_p, false);
|
||||
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
||||
cur_p->selected = idx;
|
||||
|
||||
// store the selected token ID for acceptance later
|
||||
ctx->pending_token_id = cur_p->data[idx].id;
|
||||
ctx->pending_token_idx = idx;
|
||||
}
|
||||
|
||||
static void llama_sampler_adaptive_p_accept(struct llama_sampler * smpl, llama_token token) {
|
||||
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
||||
if (ctx->pending_token_id == token) {
|
||||
GGML_ASSERT(ctx->pending_token_id != LLAMA_TOKEN_NULL);
|
||||
GGML_ASSERT(ctx->pending_token_idx != -1);
|
||||
// update EMA with the original probability of the selected token
|
||||
ctx->weighted_sum = ctx->original_probs[ctx->pending_token_idx] + ctx->decay * ctx->weighted_sum;
|
||||
ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
|
||||
}
|
||||
ctx->pending_token_id = LLAMA_TOKEN_NULL;
|
||||
ctx->pending_token_idx = -1;
|
||||
}
|
||||
|
||||
static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
|
||||
// ctx->target and ctx->decay never change after init, so it's safe to keep them as is.
|
||||
// original_probs is completely overwritten on every call to _apply.
|
||||
// so we only need to reset the EMA state and pending token.
|
||||
ctx->weighted_sum = ctx->target / (1.0f - ctx->decay);
|
||||
ctx->total_weight = 1.0f / (1.0f - ctx->decay);
|
||||
ctx->pending_token_id = LLAMA_TOKEN_NULL;
|
||||
ctx->pending_token_idx = -1;
|
||||
ctx->seed_cur = get_rng_seed(ctx->seed);
|
||||
ctx->rng.seed(ctx->seed_cur);
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
|
||||
auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
|
||||
auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
|
||||
|
||||
// copy everything (target, decay, seed, and RNG are already set)
|
||||
result_ctx->weighted_sum = ctx->weighted_sum;
|
||||
result_ctx->total_weight = ctx->total_weight;
|
||||
result_ctx->pending_token_id = ctx->pending_token_id;
|
||||
result_ctx->pending_token_idx = ctx->pending_token_idx;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
|
||||
delete (llama_sampler_adaptive_p *) smpl->ctx;
|
||||
}
|
||||
|
||||
static struct llama_sampler_i llama_sampler_adaptive_p_i = {
|
||||
/* .name = */ llama_sampler_adaptive_p_name,
|
||||
/* .accept = */ llama_sampler_adaptive_p_accept,
|
||||
/* .apply = */ llama_sampler_adaptive_p_apply,
|
||||
/* .reset = */ llama_sampler_adaptive_p_reset,
|
||||
/* .clone = */ llama_sampler_adaptive_p_clone,
|
||||
/* .free = */ llama_sampler_adaptive_p_free,
|
||||
/* .backend_init = */ nullptr,
|
||||
/* .backend_accept = */ nullptr,
|
||||
/* .backend_apply = */ nullptr,
|
||||
/* .backend_set_input = */ nullptr,
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_adaptive_p(
|
||||
float target,
|
||||
float decay,
|
||||
uint32_t seed
|
||||
) {
|
||||
auto seed_cur = get_rng_seed(seed);
|
||||
float clamped_decay = std::clamp(decay, 0.0f, 0.99f);
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_adaptive_p_i,
|
||||
/* .ctx = */ new llama_sampler_adaptive_p {
|
||||
/* .target = */ target,
|
||||
/* .decay = */ clamped_decay,
|
||||
/* .seed = */ seed,
|
||||
/* .seed_cur = */ seed_cur,
|
||||
/* .rng = */ std::mt19937(seed_cur),
|
||||
/* .weighted_sum = */ target / (1.0f - clamped_decay),
|
||||
/* .total_weight = */ 1.0f / (1.0f - clamped_decay),
|
||||
/* .original_probs = */ {},
|
||||
/* .pending_token_id = */ LLAMA_TOKEN_NULL,
|
||||
/* .pending_token_idx = */ -1
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// logit-bias
|
||||
|
||||
struct llama_sampler_logit_bias : public llama_sampler_backend {
|
||||
|
|
|
|||
|
|
@ -45,26 +45,6 @@ enum server_state {
|
|||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
};
|
||||
|
||||
static bool server_task_type_need_embd(server_task_type task_type) {
|
||||
switch (task_type) {
|
||||
case SERVER_TASK_TYPE_EMBEDDING:
|
||||
case SERVER_TASK_TYPE_RERANK:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool server_task_type_need_logits(server_task_type task_type) {
|
||||
switch (task_type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
case SERVER_TASK_TYPE_INFILL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct server_slot {
|
||||
int id;
|
||||
|
||||
|
|
@ -147,6 +127,17 @@ struct server_slot {
|
|||
return res;
|
||||
}
|
||||
|
||||
void prompt_clear(bool allow_processing) {
|
||||
if (!allow_processing) {
|
||||
GGML_ASSERT(!is_processing());
|
||||
}
|
||||
|
||||
SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
|
||||
prompt.tokens.clear();
|
||||
}
|
||||
|
||||
std::vector<common_adapter_lora_info> lora;
|
||||
int32_t alora_invocation_start = -1;
|
||||
|
||||
|
|
@ -167,7 +158,7 @@ struct server_slot {
|
|||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
|
||||
std::function<void(int)> callback_on_release;
|
||||
std::function<void(int /* slot_id */)> callback_on_release;
|
||||
|
||||
// Speculative decoding stats
|
||||
int32_t n_draft_total = 0; // Total draft tokens generated
|
||||
|
|
@ -196,30 +187,24 @@ struct server_slot {
|
|||
n_draft_total = 0;
|
||||
n_draft_accepted = 0;
|
||||
|
||||
task_prev = std::move(task);
|
||||
task.reset();
|
||||
task_prev.reset();
|
||||
|
||||
llama_set_sampler(ctx, id, nullptr);
|
||||
|
||||
// clear alora start
|
||||
alora_invocation_start = -1;
|
||||
}
|
||||
|
||||
// remove cached prompt + tokens
|
||||
void clear(bool allow_processing) {
|
||||
if (!allow_processing) {
|
||||
GGML_ASSERT(!is_processing());
|
||||
void init_sampler() const {
|
||||
common_sampler_reset(smpl.get());
|
||||
|
||||
if (!task->need_sampling()) {
|
||||
return;
|
||||
}
|
||||
|
||||
SLT_INF(*this, "clearing slot with %zu tokens\n", prompt.tokens.size());
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
|
||||
prompt.tokens.clear();
|
||||
}
|
||||
|
||||
void init_sampler() const {
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
common_sampler_reset(smpl.get());
|
||||
|
||||
int n_text = 0;
|
||||
|
||||
for (int i = 0; i < (int) prompt.tokens.size(); i++) {
|
||||
|
|
@ -235,25 +220,13 @@ struct server_slot {
|
|||
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
|
||||
}
|
||||
|
||||
// TODO: move to server_task
|
||||
bool need_embd() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
return server_task_type_need_embd(task->type);
|
||||
}
|
||||
|
||||
// TODO: move to server_task
|
||||
bool need_logits() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
return server_task_type_need_logits(task->type);
|
||||
}
|
||||
|
||||
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
|
||||
// also we cannot split if the pooling would require any past tokens
|
||||
bool can_split() const {
|
||||
GGML_ASSERT(task);
|
||||
|
||||
return
|
||||
!need_embd() ||
|
||||
!task->need_embd() ||
|
||||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
|
||||
}
|
||||
|
||||
|
|
@ -325,17 +298,6 @@ struct server_slot {
|
|||
return n_draft_max;
|
||||
}
|
||||
|
||||
// note: a slot can also be either a parent or a child
|
||||
// TODO: move to server_task
|
||||
bool is_parent() const {
|
||||
return task->n_children > 0;
|
||||
}
|
||||
|
||||
// TODO: move to server_task
|
||||
bool is_child() const {
|
||||
return task->id_parent >= 0;
|
||||
}
|
||||
|
||||
void release() {
|
||||
if (is_processing()) {
|
||||
GGML_ASSERT(task);
|
||||
|
|
@ -348,12 +310,11 @@ struct server_slot {
|
|||
state = SLOT_STATE_IDLE;
|
||||
|
||||
// do not keep context of the child slots - the parent's context is enough
|
||||
if (is_child()) {
|
||||
clear(false);
|
||||
if (task->is_child()) {
|
||||
prompt_clear(false);
|
||||
}
|
||||
|
||||
task_prev = std::move(task);
|
||||
task.reset();
|
||||
reset();
|
||||
|
||||
callback_on_release(id);
|
||||
}
|
||||
|
|
@ -801,6 +762,7 @@ private:
|
|||
|
||||
slots.clear();
|
||||
|
||||
// initialize slots
|
||||
for (int i = 0; i < params_base.n_parallel; i++) {
|
||||
server_slot slot;
|
||||
|
||||
|
|
@ -832,8 +794,8 @@ private:
|
|||
|
||||
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
|
||||
|
||||
slot.callback_on_release = [this](int) {
|
||||
queue_tasks.pop_deferred_task();
|
||||
slot.callback_on_release = [this](int slot_id) {
|
||||
queue_tasks.pop_deferred_task(slot_id);
|
||||
};
|
||||
|
||||
slot.reset();
|
||||
|
|
@ -947,9 +909,9 @@ private:
|
|||
return true;
|
||||
}
|
||||
|
||||
server_slot * get_slot_by_id(int id) {
|
||||
server_slot * get_slot_by_id(int id_slot) {
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.id == id) {
|
||||
if (slot.id == id_slot) {
|
||||
return &slot;
|
||||
}
|
||||
}
|
||||
|
|
@ -1049,7 +1011,7 @@ private:
|
|||
ret->prompt_save(*prompt_cache);
|
||||
|
||||
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
|
||||
ret->clear(false);
|
||||
ret->prompt_clear(false);
|
||||
}
|
||||
|
||||
prompt_cache->update();
|
||||
|
|
@ -1081,7 +1043,7 @@ private:
|
|||
if (slot.prompt.n_tokens() > 0) {
|
||||
SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
|
||||
|
||||
slot.clear(false);
|
||||
slot.prompt_clear(false);
|
||||
|
||||
res = true;
|
||||
|
||||
|
|
@ -1107,8 +1069,6 @@ private:
|
|||
}
|
||||
|
||||
bool launch_slot_with_task(server_slot & slot, server_task && task) {
|
||||
slot.reset();
|
||||
|
||||
// process per-request lora adapters
|
||||
if (!task.params.lora.empty()) {
|
||||
auto task_loras = construct_lora_list(task.params.lora);
|
||||
|
|
@ -1182,7 +1142,7 @@ private:
|
|||
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
|
||||
|
||||
// initialize samplers
|
||||
{
|
||||
if (task.need_sampling()) {
|
||||
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
|
||||
|
||||
if (slot.smpl == nullptr) {
|
||||
|
|
@ -1211,6 +1171,8 @@ private:
|
|||
}
|
||||
|
||||
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
|
||||
} else {
|
||||
slot.smpl.reset();
|
||||
}
|
||||
|
||||
// initialize draft batch
|
||||
|
|
@ -1223,12 +1185,11 @@ private:
|
|||
|
||||
slot.task = std::make_unique<const server_task>(std::move(task));
|
||||
|
||||
slot.state = slot.is_child()
|
||||
slot.state = slot.task->is_child()
|
||||
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
|
||||
: SLOT_STATE_STARTED;
|
||||
|
||||
SLT_INF(slot, "processing task, is_child = %d\n", slot.is_child());
|
||||
|
||||
SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1623,9 +1584,7 @@ private:
|
|||
|
||||
// tokenize the input if it's set by CLI, return false on error
|
||||
bool tokenize_cli_input(server_task & task) {
|
||||
if (task.cli_input == nullptr) {
|
||||
return true; // nothing to do
|
||||
}
|
||||
GGML_ASSERT(task.cli_input != nullptr);
|
||||
try {
|
||||
auto & opt = oai_parser_opt;
|
||||
common_chat_templates_inputs inputs;
|
||||
|
|
@ -1659,6 +1618,64 @@ private:
|
|||
return true;
|
||||
}
|
||||
|
||||
std::vector<server_slot *> get_free_slots(size_t n_slots_needed, int exclude_id_slot) {
|
||||
std::vector<server_slot *> free_slots;
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing() && slot.id != exclude_id_slot) {
|
||||
free_slots.push_back(&slot);
|
||||
}
|
||||
if (free_slots.size() >= n_slots_needed) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return free_slots;
|
||||
}
|
||||
|
||||
// launch multiple slots for parent + child tasks
|
||||
bool launch_slots_with_parent_task(server_slot & parent_slot, std::vector<server_slot *> & child_slots, server_task && parent_task) {
|
||||
GGML_ASSERT(!parent_slot.is_processing());
|
||||
GGML_ASSERT(parent_task.is_parent());
|
||||
GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
|
||||
|
||||
int id_parent = parent_task.id;
|
||||
|
||||
SRV_INF("launching slots for parent task id_task = %d with %zu child tasks\n", id_parent, parent_task.child_tasks.size());
|
||||
|
||||
// to be called in case of failure to release all launched slots
|
||||
auto release_slots = [this, id_parent]() {
|
||||
for (auto & slot : slots) {
|
||||
if (slot.is_processing() && (
|
||||
slot.task->id == id_parent ||
|
||||
slot.task->id_parent == id_parent
|
||||
)) {
|
||||
slot.release();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// launch all child tasks first
|
||||
size_t idx = 0;
|
||||
GGML_ASSERT(child_slots.size() == parent_task.child_tasks.size());
|
||||
for (auto * slot : child_slots) {
|
||||
int id_child = parent_task.child_tasks[idx].id;
|
||||
if (!launch_slot_with_task(*slot, std::move(parent_task.child_tasks[idx]))) {
|
||||
SRV_ERR("failed to launch slot with child task, id_task = %d\n", id_child);
|
||||
release_slots();
|
||||
return false;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
// finally, launch the parent task
|
||||
if (!launch_slot_with_task(parent_slot, std::move(parent_task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_parent);
|
||||
release_slots();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void process_single_task(server_task && task) {
|
||||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
|
|
@ -1666,31 +1683,55 @@ private:
|
|||
case SERVER_TASK_TYPE_EMBEDDING:
|
||||
case SERVER_TASK_TYPE_RERANK:
|
||||
{
|
||||
if (!tokenize_cli_input(task)) {
|
||||
break;
|
||||
// special case: if input is provided via CLI, tokenize it first
|
||||
// otherwise, no need to tokenize as it's already done inside the HTTP thread
|
||||
if (task.cli_input != nullptr) {
|
||||
if (!tokenize_cli_input(task)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const int id_slot = task.id_slot;
|
||||
const int id_task = task.id;
|
||||
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
server_slot * slot = id_slot != -1
|
||||
? get_slot_by_id(id_slot)
|
||||
: get_available_slot(task);
|
||||
|
||||
//
|
||||
// slot scheduling logic
|
||||
//
|
||||
|
||||
if (slot == nullptr) {
|
||||
// if no slot is available, we defer this task for processing later
|
||||
SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
|
||||
SRV_DBG("no slot is available, defer task, id_task = %d\n", id_task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", id_task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (!launch_slot_with_task(*slot, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
|
||||
break;
|
||||
if (task.is_parent()) {
|
||||
// try getting free slots for all child tasks
|
||||
size_t n_child_tasks = task.child_tasks.size();
|
||||
std::vector<server_slot *> child_slots = get_free_slots(n_child_tasks, slot->id);
|
||||
if (child_slots.size() < n_child_tasks) {
|
||||
SRV_DBG("not enough free slots for child tasks, n_free = %zu, n_children = %zu, defer task, id_task = %d\n", child_slots.size(), n_child_tasks, id_task);
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
if (!launch_slots_with_parent_task(*slot, child_slots, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with parent task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
}
|
||||
} else if (!launch_slot_with_task(*slot, std::move(task))) {
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
}
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_CANCEL:
|
||||
|
|
@ -1864,7 +1905,7 @@ private:
|
|||
// Erase token cache
|
||||
const size_t n_erased = slot->prompt.tokens.size();
|
||||
|
||||
slot->clear(false);
|
||||
slot->prompt_clear(false);
|
||||
|
||||
auto res = std::make_unique<server_task_result_slot_erase>();
|
||||
res->id = task.id;
|
||||
|
|
@ -1959,7 +2000,7 @@ private:
|
|||
GGML_ABORT("not supported by multimodal");
|
||||
}
|
||||
|
||||
if (slot.is_parent() || slot.is_child()) {
|
||||
if (slot.task->is_parent() || slot.task->is_child()) {
|
||||
send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER);
|
||||
slot.release();
|
||||
continue;
|
||||
|
|
@ -2106,21 +2147,6 @@ private:
|
|||
|
||||
// this slot still has a prompt to be processed
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
|
||||
// wait for all children to be launched
|
||||
if (slot.is_parent()) {
|
||||
int n_launched = 0;
|
||||
for (auto & other : slots) {
|
||||
if (other.is_processing() && other.is_child() && other.task->id_parent == slot.task->id) {
|
||||
++n_launched;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_launched < slot.task->n_children) {
|
||||
SLT_DBG(slot, "waiting for children to be launched, n_children = %d, n_launched = %d\n", slot.task->n_children, n_launched);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const auto & input_tokens = slot.task->tokens;
|
||||
|
||||
// TODO: maybe move branch to outside of this loop in the future
|
||||
|
|
@ -2161,7 +2187,7 @@ private:
|
|||
}
|
||||
|
||||
// TODO: support memory-less logits computation
|
||||
if (slot.need_logits() && !llama_get_memory(ctx)) {
|
||||
if (slot.task->need_logits() && !llama_get_memory(ctx)) {
|
||||
send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
|
||||
slot.release();
|
||||
continue;
|
||||
|
|
@ -2421,7 +2447,7 @@ private:
|
|||
if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
|
||||
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
|
||||
|
||||
slot.clear(true);
|
||||
slot.prompt_clear(true);
|
||||
|
||||
// there is no common part left
|
||||
slot.n_prompt_tokens_cache = 0;
|
||||
|
|
@ -2500,7 +2526,7 @@ private:
|
|||
cur_tok,
|
||||
slot.prompt.tokens.pos_next(),
|
||||
{ slot.id },
|
||||
slot.need_embd());
|
||||
slot.task->need_embd());
|
||||
slot.prompt.tokens.push_back(cur_tok);
|
||||
|
||||
slot.n_prompt_tokens_processed++;
|
||||
|
|
@ -2590,7 +2616,7 @@ private:
|
|||
slot_batched->lora[alora_disabled_id].scale = alora_scale;
|
||||
}
|
||||
|
||||
llama_set_embeddings(ctx, slot_batched->need_embd());
|
||||
llama_set_embeddings(ctx, slot_batched->task->need_embd());
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
|
|
@ -2648,7 +2674,7 @@ private:
|
|||
|
||||
// note: it's complicated to keep track of how much of the current batch has been
|
||||
// processed before the error occurred, so we simply clear the entire context
|
||||
slot.clear(false);
|
||||
slot.prompt_clear(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2674,9 +2700,7 @@ private:
|
|||
|
||||
// handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too
|
||||
for (auto & slot : slots) {
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
|
||||
SLT_INF(slot, "parent task prompt done, n_children = %d\n", slot.task->n_children);
|
||||
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) {
|
||||
std::vector<server_slot *> children;
|
||||
for (auto & other : slots) {
|
||||
if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) {
|
||||
|
|
@ -2684,17 +2708,15 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
// we can only proceed if all child slots are having the correct tasks
|
||||
if (slot.task->n_children == (int) children.size()) {
|
||||
// copy state to the child slots
|
||||
for (auto & child : children) {
|
||||
SLT_INF(slot, " - copying state to child %d\n", child->id);
|
||||
// all children slots should already launched by launch_slots_with_parent_task()
|
||||
// copy state to the child slots
|
||||
for (auto & child : children) {
|
||||
SLT_INF(slot, " - copying state to child %d\n", child->id);
|
||||
|
||||
GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
|
||||
GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER);
|
||||
|
||||
slot.copy_state_to(*child);
|
||||
child->state = SLOT_STATE_DONE_PROMPT;
|
||||
}
|
||||
slot.copy_state_to(*child);
|
||||
child->state = SLOT_STATE_DONE_PROMPT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -2727,6 +2749,8 @@ private:
|
|||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
GGML_ASSERT(slot.task->need_sampling());
|
||||
|
||||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
|
|
@ -2968,7 +2992,9 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
|||
// Everything else, including multimodal completions.
|
||||
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
|
||||
}
|
||||
tasks.reserve(inputs.size());
|
||||
|
||||
// tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks
|
||||
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
server_task task = server_task(type);
|
||||
|
||||
|
|
@ -2989,23 +3015,13 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
|||
|
||||
// prepare child tasks
|
||||
if (task.params.n_cmpl > 1) {
|
||||
task.n_children = task.params.n_cmpl - 1;
|
||||
|
||||
for (int j = 0; j < task.n_children; j++) {
|
||||
server_task child = task.create_child(task.id, rd.get_new_id());
|
||||
|
||||
// use different sampling seed for each child
|
||||
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
|
||||
if (child.params.sampling.seed != LLAMA_DEFAULT_SEED) {
|
||||
child.params.sampling.seed += j + 1;
|
||||
}
|
||||
|
||||
tasks.push_back(std::move(child));
|
||||
int n_children = task.params.n_cmpl - 1;
|
||||
for (int j = 0; j < n_children; j++) {
|
||||
task.add_child(task.id, rd.get_new_id());
|
||||
}
|
||||
}
|
||||
|
||||
// note: the parent task always launches first
|
||||
tasks.insert(tasks.begin(), std::move(task));
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
rd.post_tasks(std::move(tasks));
|
||||
|
|
|
|||
|
|
@ -74,11 +74,26 @@ int server_queue::get_new_id() {
|
|||
return new_id;
|
||||
}
|
||||
|
||||
void server_queue::pop_deferred_task() {
|
||||
void server_queue::pop_deferred_task(int id_slot) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (!queue_tasks_deferred.empty()) {
|
||||
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
|
||||
queue_tasks_deferred.pop_front();
|
||||
// try to find a task that uses the specified slot
|
||||
bool found = false;
|
||||
for (auto it = queue_tasks_deferred.begin(); it != queue_tasks_deferred.end(); ++it) {
|
||||
if (it->id_slot == id_slot) {
|
||||
QUE_DBG("pop deferred task (use slot %d), id_task = %d\n", id_slot, it->id);
|
||||
queue_tasks.emplace_front(std::move(*it));
|
||||
queue_tasks_deferred.erase(it);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// if not tasks found using the slot, just pop the first deferred task (default behavior)
|
||||
if (!found) {
|
||||
QUE_DBG("pop deferred task, id_task = %d\n", queue_tasks_deferred.front().id);
|
||||
queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
|
||||
queue_tasks_deferred.pop_front();
|
||||
}
|
||||
}
|
||||
time_last_task = ggml_time_ms();
|
||||
condition_tasks.notify_one();
|
||||
|
|
@ -217,12 +232,12 @@ void server_response::add_waiting_task_id(int id_task) {
|
|||
waiting_task_ids.insert(id_task);
|
||||
}
|
||||
|
||||
void server_response::add_waiting_tasks(const std::vector<server_task> & tasks) {
|
||||
void server_response::add_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
|
||||
for (const auto & task : tasks) {
|
||||
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
|
||||
waiting_task_ids.insert(task.id);
|
||||
for (const auto & id_task : id_tasks) {
|
||||
RES_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
|
||||
waiting_task_ids.insert(id_task);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -327,6 +342,7 @@ void server_response::terminate() {
|
|||
|
||||
void server_response_reader::post_task(server_task && task, bool front) {
|
||||
GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
|
||||
GGML_ASSERT(!task.is_parent() && "not supported, use post_tasks() instead");
|
||||
task.index = 0;
|
||||
id_tasks.insert(task.id);
|
||||
states.push_back(task.create_state());
|
||||
|
|
@ -338,11 +354,18 @@ void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool
|
|||
GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
|
||||
id_tasks = server_task::get_list_id(tasks);
|
||||
states.reserve(tasks.size());
|
||||
for (size_t i = 0; i < tasks.size(); i++) {
|
||||
tasks[i].index = i;
|
||||
states.push_back(tasks[i].create_state());
|
||||
size_t index = 0;
|
||||
for (auto & task : tasks) {
|
||||
task.index = index++;
|
||||
states.push_back(task.create_state());
|
||||
// for child tasks
|
||||
for (auto & child_task : task.child_tasks) {
|
||||
child_task.index = index++;
|
||||
states.push_back(child_task.create_state());
|
||||
}
|
||||
}
|
||||
queue_results.add_waiting_tasks(tasks);
|
||||
GGML_ASSERT(states.size() == id_tasks.size());
|
||||
queue_results.add_waiting_task_ids(id_tasks);
|
||||
queue_tasks.post(std::move(tasks), front);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,8 @@ public:
|
|||
int get_new_id();
|
||||
|
||||
// Call when the state of one slot is changed, it will move one task from deferred to main queue
|
||||
void pop_deferred_task();
|
||||
// prioritize tasks that use the specified slot (otherwise, pop the first deferred task)
|
||||
void pop_deferred_task(int id_slot);
|
||||
|
||||
// if sleeping, request exiting sleep state and wait until it is done
|
||||
// returns immediately if not sleeping
|
||||
|
|
@ -124,7 +125,7 @@ public:
|
|||
// add the id_task to the list of tasks waiting for response
|
||||
void add_waiting_task_id(int id_task);
|
||||
|
||||
void add_waiting_tasks(const std::vector<server_task> & tasks);
|
||||
void add_waiting_task_ids(const std::unordered_set<int> & id_tasks);
|
||||
|
||||
// when the request is finished, we can remove task associated with it
|
||||
void remove_waiting_task_id(int id_task);
|
||||
|
|
|
|||
|
|
@ -204,6 +204,8 @@ task_params server_task::params_from_json_cmpl(
|
|||
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
|
||||
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
|
||||
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
|
||||
params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target);
|
||||
params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay);
|
||||
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
|
||||
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
|
||||
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
|
||||
|
|
|
|||
|
|
@ -121,8 +121,10 @@ struct server_task {
|
|||
int id_slot = -1;
|
||||
|
||||
// used by parallel sampling (multiple completions from same prompt)
|
||||
int n_children = 0; // number of tasks reusing this prompt
|
||||
int id_parent = -1;
|
||||
// temporary store of child tasks for scheduling
|
||||
// note: accessing to elements is invalid after the task is moved to server_slot
|
||||
std::vector<server_task> child_tasks;
|
||||
|
||||
// used by SERVER_TASK_TYPE_INFERENCE
|
||||
task_params params;
|
||||
|
|
@ -156,6 +158,36 @@ struct server_task {
|
|||
return tokens.size();
|
||||
}
|
||||
|
||||
bool need_embd() const {
|
||||
switch (type) {
|
||||
case SERVER_TASK_TYPE_EMBEDDING:
|
||||
case SERVER_TASK_TYPE_RERANK:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool need_logits() const {
|
||||
switch (type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
case SERVER_TASK_TYPE_INFILL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool need_sampling() const {
|
||||
switch (type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
case SERVER_TASK_TYPE_INFILL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static task_params params_from_json_cmpl(
|
||||
const llama_vocab * vocab,
|
||||
const common_params & params_base,
|
||||
|
|
@ -167,11 +199,14 @@ struct server_task {
|
|||
std::unordered_set<int> ids(tasks.size());
|
||||
for (size_t i = 0; i < tasks.size(); i++) {
|
||||
ids.insert(tasks[i].id);
|
||||
for (auto & child : tasks[i].child_tasks) {
|
||||
ids.insert(child.id);
|
||||
}
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
server_task create_child(int id_parent, int id_child) const {
|
||||
void add_child(int id_parent, int id_child) {
|
||||
server_task copy;
|
||||
|
||||
copy.id = id_child;
|
||||
|
|
@ -179,8 +214,15 @@ struct server_task {
|
|||
copy.params = params;
|
||||
copy.type = type;
|
||||
copy.tokens = tokens.clone();
|
||||
copy.id_slot = -1; // child tasks cannot specify slot
|
||||
|
||||
return copy;
|
||||
// use different sampling seed for each child
|
||||
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
|
||||
if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
|
||||
copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
|
||||
}
|
||||
|
||||
child_tasks.push_back(std::move(copy));
|
||||
}
|
||||
|
||||
// the task will be moved into queue, then onto slots
|
||||
|
|
@ -188,6 +230,14 @@ struct server_task {
|
|||
task_result_state create_state() const {
|
||||
return task_result_state(params.oaicompat_chat_syntax);
|
||||
}
|
||||
|
||||
bool is_parent() const {
|
||||
return child_tasks.size() > 0;
|
||||
}
|
||||
|
||||
bool is_child() const {
|
||||
return id_parent != -1;
|
||||
}
|
||||
};
|
||||
|
||||
struct result_timings {
|
||||
|
|
|
|||
|
|
@ -491,16 +491,22 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
|
|||
def test_chat_completions_multiple_choices():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 8,
|
||||
"n": 2,
|
||||
"messages": [
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body["choices"]) == 2
|
||||
for choice in res.body["choices"]:
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert choice["finish_reason"] == "length"
|
||||
# make sure cache can be reused across multiple choices and multiple requests
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/18663
|
||||
for _ in range(2):
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 8,
|
||||
"n": 2,
|
||||
"messages": [
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
],
|
||||
# test forcing the same slot to be used
|
||||
# the scheduler should not be locked up in this case
|
||||
"id_slot": 0,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body["choices"]) == 2
|
||||
for choice in res.body["choices"]:
|
||||
assert "assistant" == choice["message"]["role"]
|
||||
assert choice["finish_reason"] == "length"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue