mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-10 23:04:36 +00:00
Add speculative decoding support to the server and command-line interfaces
This commit is contained in:
parent
1ea2d61a97
commit
2e8e42a5ad
11 changed files with 591 additions and 31 deletions
|
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
gpt_init();
|
||||
|
||||
- if (params.model_draft.empty()) {
|
||||
+ if (params.speculative.model.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
@@ -68,7 +68,7 @@ int main(int argc, char ** argv) {
|
|||
// load the draft model
|
||||
// make a hard copy of params to use for the draft model
|
||||
gpt_params params_draft = params;
|
||||
- params_draft.model = params_draft.model_draft;
|
||||
+ params_draft.model = params_draft.speculative.model;
|
||||
params_draft.n_gpu_layers = params_draft.n_gpu_layers_draft;
|
||||
params_draft.n_world = 1; // do not split the draft model across devices
|
||||
params_draft.rank = 0; // always load the draft model on the head device
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue