Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Commit bec231422a: Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	common/CMakeLists.txt
#	docs/backend/SYCL.md
#	docs/build.md
#	docs/docker.md
#	examples/export-lora/export-lora.cpp
#	examples/main/README.md
#	examples/main/main.cpp
#	examples/run/README.md
#	examples/run/run.cpp
#	examples/server/README.md
#	examples/simple-chat/simple-chat.cpp
#	ggml/CMakeLists.txt
#	ggml/src/ggml-hip/CMakeLists.txt
#	src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp

46 changed files with 4305 additions and 578 deletions
examples/llava/README-minicpmo2.6.md (new file, 46 lines)

@@ -0,0 +1,46 @@
## MiniCPM-o 2.6

Currently, this readme covers only minicpm-omni's image capabilities; full-mode support will be added as soon as possible.

### Prepare models and code

Download the [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from Hugging Face into a "MiniCPM-o-2_6" folder.

Clone llama.cpp:

```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpm-omni
```

### Usage of MiniCPM-o 2.6

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files directly):

```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

# quantize int4 version
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Build llama.cpp using `CMake` (see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md):

```bash
cmake -B build
cmake --build build --config Release
```

Inference on Linux or Mac:

```bash
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
@@ -721,6 +721,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            else if (ctx->minicpmv_version == 3) {
                pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
            }
            else if (ctx->minicpmv_version == 4) {
                pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
            }
            ggml_set_name(pos_embed, "pos_embed");
            ggml_set_input(pos_embed);
        }

@@ -1056,6 +1059,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                n_head = hidden_size/d_head;
                num_query = 64;
            }
            else if (ctx->minicpmv_version == 4) {
                hidden_size = 3584;
                n_head = hidden_size/d_head;
                num_query = 64;
            }

            struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));

@@ -2141,6 +2149,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                images[images.size()-1].push_back(patch);
            }
        }
        clip_image_u8_free(refine_image);
    }
    return images;
}

@@ -2179,6 +2188,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            clip_image_f32_free(res);
        }
    }
    for (size_t i = 0; i < imgs.size(); ++i) {
        for (size_t j = 0; j < imgs[i].size(); ++j) {
            if (imgs[i][j] != nullptr) {
                clip_image_u8_free(imgs[i][j]);
            }
        }
    }
    return true;
}
else if (ctx->has_qwen2vl_merger) {

@@ -2435,6 +2451,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
        else if (ctx->minicpmv_version == 3) {
            n_patches = 64;
        }
        else if (ctx->minicpmv_version == 4) {
            n_patches = 64;
        }
    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        int patch_size = params.patch_size * 2;
        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);

@@ -2614,8 +2633,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
        int* positions_data = (int*)malloc(ggml_nbytes(positions));
        int bucket_coords_h[70];
        int bucket_coords_w[70];
        int bucket_coords_h[1024];
        int bucket_coords_w[1024];
        for (int i = 0; i < pos_h; i++){
            bucket_coords_h[i] = std::floor(70.0*i/pos_h);
        }
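This hunk grows the `bucket_coords_h` / `bucket_coords_w` stack arrays from 70 to 1024 entries while the bucketing itself still maps positions into 70 buckets. The likely motivation, inferred from the loop rather than stated in the commit: the loop writes one entry per patch position (`i < pos_h`), so any patch grid with more than 70 positions would overflow the old arrays. A minimal sketch of the indexing invariant, with hypothetical names:

```cpp
#include <cassert>
#include <cmath>

// Illustrative sketch only: mirrors the bucketing above with an explicit
// bounds check. The constants follow the hunk; the function is hypothetical.
constexpr int MAX_PATCHES = 1024; // array size; must cover the patch grid side
constexpr int N_BUCKETS   = 70;   // the resampler's position buckets stay at 70

void fill_bucket_coords(int * bucket_coords, int pos) {
    assert(pos <= MAX_PATCHES); // the old [70] arrays lacked this headroom
    for (int i = 0; i < pos; i++) {
        // each of the `pos` positions is assigned one of the 70 buckets
        bucket_coords[i] = (int) std::floor((double) N_BUCKETS * i / pos);
    }
}
```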
@@ -2643,6 +2662,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        else if (ctx->minicpmv_version == 3) {
            embed_dim = 3584;
        }
        else if (ctx->minicpmv_version == 4) {
            embed_dim = 3584;
        }
        auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

        float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));

@@ -2896,6 +2918,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        else if (ctx->minicpmv_version == 3) {
            return 3584;
        }
        else if (ctx->minicpmv_version == 4) {
            return 3584;
        }
    }
    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        return ctx->vision_model.mm_1_b->ne[0];
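Across these hunks, MiniCPM-o 2.6 is wired in as `minicpmv_version == 4` and deliberately reuses the MiniCPM-V 2.6 (version 3) vision dimensions: a 3584-wide embedding, 64 resampler queries, and 64 patches. A table-driven helper would make that sharing explicit; this is a hypothetical refactor sketch, not part of the patch:

```cpp
// Hypothetical helper (not in the patch): centralizes the per-version
// constants that the hunks above repeat in if/else chains.
struct minicpmv_dims {
    int embed_dim; // resampler / projector embedding width
    int num_query; // number of resampler queries (== n_patches here)
};

static minicpmv_dims minicpmv_get_dims(int version) {
    // Values for versions 3 and 4 are taken from the hunks above.
    switch (version) {
        case 3:  return { 3584, 64 };  // MiniCPM-V 2.6
        case 4:  return { 3584, 64 };  // MiniCPM-o 2.6 reuses 2.6's dims
        default: return { 4096, 64 };  // assumption: fallback, not from the patch
    }
}
```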
@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    return true;
}

static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
    int width = image->nx;
    int height = image->ny;
    int num_patches = (height / patch_size) * (width / patch_size);

@@ -277,13 +277,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
        }
        else {
            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
            if (has_minicpmv_projector == 2) {
                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
            }
            else if (has_minicpmv_projector == 3) {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
        }

        if (!encoded) {

@@ -313,6 +307,9 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        load_image_size->height = img->ny;
        clip_add_load_image_size(ctx_clip, load_image_size);
        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
        delete[] img_res_v.data;
        img_res_v.size = 0;
        img_res_v.data = nullptr;
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
@@ -140,6 +140,9 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    else if (has_minicpmv_projector == 3) {
        system_prompt = "<|im_start|>user\n";
    }
    else if (has_minicpmv_projector == 4) {
        system_prompt = "<|im_start|>user\n";
    }
    LOG_INF("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);

@@ -227,6 +230,9 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
        else if (has_minicpmv_projector == 3) {
            user_prompt = "<|im_start|>user\n" + prompt;
        }
        else if (has_minicpmv_projector == 4) {
            user_prompt = "<|im_start|>user\n" + prompt;
        }
    }

    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

@@ -236,6 +242,9 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
    else if (has_minicpmv_projector == 3) {
        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
    }
    else if (has_minicpmv_projector == 4) {
        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
    }

    // generate the response

@@ -308,7 +317,6 @@ int main(int argc, char ** argv) {
        const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
        printf("%s", tmp);// mistral llava-1.6
        if (strstr(response.c_str(), "<user>")) break; // minicpm-v
        fflush(stdout);
@@ -501,7 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)

# with proper
args = ap.parse_args()

@@ -545,12 +545,19 @@ if args.use_f32:

minicpmv_version = args.minicpmv_version
emb_dim = 4096
block_count = 26
if minicpmv_version == 1:
    emb_dim = 2304
    block_count = 26
elif minicpmv_version == 2:
    emb_dim = 4096
    block_count = 27
elif minicpmv_version == 3:
    emb_dim = 3584
    block_count = 27
elif minicpmv_version == 4:
    emb_dim = 3584
    block_count = 27

default_vision_config = {
    "hidden_size": 1152,

@@ -567,6 +574,9 @@ model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)

processor = None
# if model.attn_pool is not None:

@@ -587,7 +597,7 @@ elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
    minicpmv_version = 3
    minicpmv_version = 4
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False

@@ -625,7 +635,6 @@ if has_vision_encoder:
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    block_count = 26
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

if processor is not None:
@@ -8,7 +8,7 @@ ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
checkpoint = model.state_dict()

# get a list of mm tensor names
@@ -4,6 +4,7 @@
#include "log.h"
#include "sampling.h"
#include "llama.h"
#include "chat-template.hpp"
#include "build-info.h"

#include <cstdio>

@@ -85,14 +86,6 @@ static void sigint_handler(int signo) {
}
#endif

static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
    common_chat_msg new_msg{role, content};
    auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    LOG_DBG("formatted: '%s'\n", formatted.c_str());
    return formatted;
}

int main(int argc, char ** argv) {
    common_params params;
    g_params = &params;

@@ -166,6 +159,7 @@ int main(int argc, char ** argv) {
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);
    auto chat_templates = common_chat_templates_from_model(model, params.chat_template);

    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

@@ -208,7 +202,7 @@ int main(int argc, char ** argv) {
    }

    // auto enable conversation mode if chat template is available
    const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
    const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
        if (has_chat_template) {
            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);

@@ -226,7 +220,7 @@ int main(int argc, char ** argv) {
    // print chat template example in conversation mode
    if (params.conversation_mode) {
        if (params.enable_chat_template) {
            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
        } else {
            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }

@@ -270,10 +264,18 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd_inp;

    auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
        common_chat_msg new_msg{role, content};
        auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
        chat_msgs.push_back({role, content});
        LOG_DBG("formatted: '%s'\n", formatted.c_str());
        return formatted;
    };

    {
        auto prompt = (params.conversation_mode && params.enable_chat_template)
            // format the system prompt in conversation mode (fallback to default if empty)
            ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
            // otherwise use the prompt as is
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {

@@ -780,7 +782,7 @@ int main(int argc, char ** argv) {
            }

            if (params.enable_chat_template) {
                chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                chat_add_and_format("assistant", assistant_ss.str());
            }
            is_interacting = true;
            LOG("\n");

@@ -845,7 +847,7 @@ int main(int argc, char ** argv) {

                bool format_chat = params.conversation_mode && params.enable_chat_template;
                std::string user_inp = format_chat
                    ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                    ? chat_add_and_format("user", std::move(buffer))
                    : std::move(buffer);
                // TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
|
|||
*
|
||||
*/
|
||||
|
||||
#include <termios.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
#include "linenoise.h"
|
||||
# include "linenoise.h"
|
||||
|
||||
#define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100
|
||||
#define LINENOISE_MAX_LINE 4096
|
||||
static std::vector<const char*> unsupported_term = {"dumb","cons25","emacs",nullptr};
|
||||
# include <ctype.h>
|
||||
# include <errno.h>
|
||||
# include <stdio.h>
|
||||
# include <string.h>
|
||||
# include <sys/file.h>
|
||||
# include <sys/ioctl.h>
|
||||
# include <sys/stat.h>
|
||||
# include <sys/types.h>
|
||||
# include <termios.h>
|
||||
# include <unistd.h>
|
||||
|
||||
# include <memory>
|
||||
# include <string>
|
||||
# include <vector>
|
||||
|
||||
# define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100
|
||||
# define LINENOISE_MAX_LINE 4096
|
||||
static std::vector<const char *> unsupported_term = { "dumb", "cons25", "emacs" };
|
||||
static linenoiseCompletionCallback *completionCallback = NULL;
|
||||
static linenoiseHintsCallback *hintsCallback = NULL;
|
||||
static linenoiseFreeHintsCallback *freeHintsCallback = NULL;
|
||||
|
@@ -166,21 +168,58 @@ int linenoiseHistoryAdd(const char *line);
#define REFRESH_ALL (REFRESH_CLEAN|REFRESH_WRITE) // Do both.
static void refreshLine(struct linenoiseState *l);

class File {
  public:
    FILE * file = nullptr;

    FILE * open(const std::string & filename, const char * mode) {
        file = fopen(filename.c_str(), mode);

        return file;
    }

    int lock() {
        if (file) {
            fd = fileno(file);
            if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
                fd = -1;

                return 1;
            }
        }

        return 0;
    }

    ~File() {
        if (fd >= 0) {
            flock(fd, LOCK_UN);
        }

        if (file) {
            fclose(file);
        }
    }

  private:
    int fd = -1;
};

__attribute__((format(printf, 1, 2)))
/* Debugging function. */
#if 0
static void lndebug(const char *fmt, ...) {
    static FILE *lndebug_fp = NULL;
    if (lndebug_fp == NULL) {
        lndebug_fp = fopen("/tmp/lndebug.txt", "a");
    static File file;
    if (file.file == nullptr) {
        file.open("/tmp/lndebug.txt", "a");
    }

    if (lndebug_fp != NULL) {
    if (file.file != nullptr) {
        va_list args;
        va_start(args, fmt);
        vfprintf(lndebug_fp, fmt, args);
        vfprintf(file.file, fmt, args);
        va_end(args);
        fflush(lndebug_fp);
        fflush(file.file);
    }
}
#else
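The new `File` class is a small RAII wrapper: the destructor releases the `flock` (if `lock()` succeeded) and closes the `FILE *`, so every early `return` path is covered. A sketch of how it is meant to be used, assuming the class as defined above plus the usual headers:

```cpp
#include <cstdio>
#include <string>
#include <sys/file.h>
// assumes the File class from the hunk above is in scope

// Illustrative only: append one line to a lock-protected file. Whichever
// path exits this function, ~File() unlocks and closes the handle.
static int write_locked(const std::string & path, const char * line) {
    File file;
    if (file.open(path, "a") == nullptr) {
        return -1;              // nothing opened, destructor is a no-op
    }
    if (file.lock() != 0) {
        return -1;              // the open file is still fclose()d by ~File()
    }
    fprintf(file.file, "%s\n", line);
    return 0;                   // ~File(): flock(fd, LOCK_UN), then fclose()
}
```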
@@ -213,8 +252,11 @@ void linenoiseSetMultiLine(int ml) {
static int isUnsupportedTerm(void) {
    char *term = getenv("TERM");
    if (term == NULL) return 0;
    for (int j = 0; unsupported_term[j]; ++j)
        if (!strcasecmp(term, unsupported_term[j])) return 1;
    for (size_t j = 0; j < unsupported_term.size(); ++j) {
        if (!strcasecmp(term, unsupported_term[j])) {
            return 1;
        }
    }
    return 0;
}

@@ -334,17 +376,6 @@ static void linenoiseBeep(void) {
    fflush(stderr);
}

/* ============================== Completion ================================ */

/* Free a list of completion option populated by linenoiseAddCompletion(). */
static void freeCompletions(linenoiseCompletions *lc) {
    size_t i;
    for (i = 0; i < lc->len; i++)
        free(lc->cvec[i]);
    if (lc->cvec != NULL)
        free(lc->cvec);
}

/* Called by completeLine() and linenoiseShow() to render the current
 * edited line with the proposed completion. If the current completion table
 * is already available, it is passed as second argument, otherwise the

@@ -353,9 +384,9 @@ static void freeCompletions(linenoiseCompletions *lc) {
 * Flags are the same as refreshLine*(), that is REFRESH_* macros. */
static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags) {
    /* Obtain the table of completions if the caller didn't provide one. */
    linenoiseCompletions ctable = { 0, NULL };
    linenoiseCompletions ctable;
    if (lc == NULL) {
        completionCallback(ls->buf,&ctable);
        completionCallback(ls->buf, &ctable);
        lc = &ctable;
    }

@@ -364,16 +395,17 @@ static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseComple
        struct linenoiseState saved = *ls;
        ls->len = ls->pos = strlen(lc->cvec[ls->completion_idx]);
        ls->buf = lc->cvec[ls->completion_idx];
        refreshLineWithFlags(ls,flags);
        refreshLineWithFlags(ls, flags);
        ls->len = saved.len;
        ls->pos = saved.pos;
        ls->buf = saved.buf;
    } else {
        refreshLineWithFlags(ls,flags);
        refreshLineWithFlags(ls, flags);
    }

    /* Free the completions table if needed. */
    if (lc != &ctable) freeCompletions(&ctable);
    if (lc == &ctable) {
        ctable.to_free = false;
    }
}

/* This is an helper function for linenoiseEdit*() and is called when the

@@ -391,11 +423,11 @@ static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseComple
 * possible completions, and the caller should read for the next characters
 * from stdin. */
static int completeLine(struct linenoiseState *ls, int keypressed) {
    linenoiseCompletions lc = { 0, NULL };
    linenoiseCompletions lc;
    int nwritten;
    char c = keypressed;

    completionCallback(ls->buf,&lc);
    completionCallback(ls->buf, &lc);
    if (lc.len == 0) {
        linenoiseBeep();
        ls->in_completion = 0;

@@ -406,7 +438,7 @@ static int completeLine(struct linenoiseState *ls, int keypressed) {
            ls->in_completion = 1;
            ls->completion_idx = 0;
        } else {
            ls->completion_idx = (ls->completion_idx+1) % (lc.len+1);
            ls->completion_idx = (ls->completion_idx + 1) % (lc.len + 1);
            if (ls->completion_idx == lc.len) linenoiseBeep();
        }
        c = 0;

@@ -420,8 +452,7 @@ static int completeLine(struct linenoiseState *ls, int keypressed) {
        default:
            /* Update buffer and return */
            if (ls->completion_idx < lc.len) {
                nwritten = snprintf(ls->buf,ls->buflen,"%s",
                    lc.cvec[ls->completion_idx]);
                nwritten = snprintf(ls->buf, ls->buflen, "%s", lc.cvec[ls->completion_idx]);
                ls->len = ls->pos = nwritten;
            }
            ls->in_completion = 0;

@@ -430,13 +461,12 @@ static int completeLine(struct linenoiseState *ls, int keypressed) {

        /* Show completion or original buffer */
        if (ls->in_completion && ls->completion_idx < lc.len) {
            refreshLineWithCompletion(ls,&lc,REFRESH_ALL);
            refreshLineWithCompletion(ls, &lc, REFRESH_ALL);
        } else {
            refreshLine(ls);
        }
    }

    freeCompletions(&lc);
    return c; /* Return last read character */
}

@@ -462,53 +492,25 @@ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *fn) {
 * user typed <tab>. See the example.c source code for a very easy to
 * understand example. */
void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) {
    size_t len = strlen(str);
    char *copy, **cvec;

    copy = (char*) malloc(len + 1);
    if (copy == NULL) return;
    memcpy(copy,str,len+1);
    cvec = (char**) realloc(lc->cvec,sizeof(char*)*(lc->len+1));
    if (cvec == NULL) {
        free(copy);
    const size_t len = strlen(str);
    auto copy = std::make_unique<char[]>(len + 1);
    if (!copy) {
        return;
    }

    memcpy(copy.get(), str, len + 1);
    char ** cvec = static_cast<char **>(std::realloc(lc->cvec, sizeof(char *) * (lc->len + 1)));
    if (cvec == nullptr) {
        return;
    }

    lc->cvec = cvec;
    lc->cvec[lc->len++] = copy;
}

/* =========================== Line editing ================================= */

/* We define a very simple "append buffer" structure, that is an heap
 * allocated string where we can append to. This is useful in order to
 * write all the escape sequences in a buffer and flush them to the standard
 * output in a single call, to avoid flickering effects. */
struct abuf {
    char *b;
    int len;
};

static void abInit(struct abuf *ab) {
    ab->b = NULL;
    ab->len = 0;
}

static void abAppend(struct abuf *ab, const char *s, int len) {
    char *new_ptr = (char*) realloc(ab->b,ab->len+len);

    if (new_ptr == NULL) return;
    memcpy(new_ptr+ab->len,s,len);
    ab->b = new_ptr;
    ab->len += len;
}

static void abFree(struct abuf *ab) {
    free(ab->b);
    lc->cvec[lc->len++] = copy.release();
}

/* Helper of refreshSingleLine() and refreshMultiLine() to show hints
 * to the right of the prompt. */
static void refreshShowHints(struct abuf * ab, struct linenoiseState * l, int plen) {
static void refreshShowHints(std::string & ab, struct linenoiseState * l, int plen) {
    char seq[64];
    if (hintsCallback && plen+l->len < l->cols) {
        int color = -1, bold = 0;

@@ -522,10 +524,11 @@ static void refreshShowHints(struct abuf * ab, struct linenoiseState * l, int pl
            snprintf(seq,64,"\033[%d;%d;49m",bold,color);
        else
            seq[0] = '\0';
        abAppend(ab,seq,strlen(seq));
        abAppend(ab,hint,hintlen);
        ab.append(seq);
        ab.append(hint, hintlen);
        if (color != -1 || bold != 0)
            abAppend(ab,"\033[0m",4);
            ab.append("\033[0m");

        /* Call the function to free the hint returned. */
        if (freeHintsCallback) freeHintsCallback(hint);
    }

@@ -546,8 +549,7 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {
    char *buf = l->buf;
    size_t len = l->len;
    size_t pos = l->pos;
    struct abuf ab;

    std::string ab;
    while((plen+pos) >= l->cols) {
        buf++;
        len--;

@@ -557,35 +559,34 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {
        len--;
    }

    abInit(&ab);
    /* Cursor to left edge */
    snprintf(seq,sizeof(seq),"\r");
    abAppend(&ab,seq,strlen(seq));
    ab.append(seq);

    if (flags & REFRESH_WRITE) {
        /* Write the prompt and the current buffer content */
        abAppend(&ab,l->prompt,strlen(l->prompt));
        ab.append(l->prompt);
        if (maskmode == 1) {
            while (len--) abAppend(&ab,"*",1);
            while (len--) {
                ab.append("*");
            }
        } else {
            abAppend(&ab,buf,len);
            ab.append(buf, len);
        }
        /* Show hits if any. */
        refreshShowHints(&ab,l,plen);
        refreshShowHints(ab, l, plen);
    }

    /* Erase to right */
    snprintf(seq,sizeof(seq),"\x1b[0K");
    abAppend(&ab,seq,strlen(seq));

    ab.append(seq);
    if (flags & REFRESH_WRITE) {
        /* Move cursor to original position. */
        snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen));
        abAppend(&ab,seq,strlen(seq));
        ab.append(seq);
    }

    if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */
    abFree(&ab);
    (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */
}

/* Multi line low level line refresh.

@@ -604,26 +605,23 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
    int col; /* colum position, zero-based. */
    int old_rows = l->oldrows;
    int fd = l->ofd, j;
    struct abuf ab;

    std::string ab;
    l->oldrows = rows;

    /* First step: clear all the lines used before. To do so start by
     * going to the last row. */
    abInit(&ab);

    if (flags & REFRESH_CLEAN) {
        if (old_rows-rpos > 0) {
            lndebug("go down %d", old_rows-rpos);
            snprintf(seq,64,"\x1b[%dB", old_rows-rpos);
            abAppend(&ab,seq,strlen(seq));
            ab.append(seq);
        }

        /* Now for every row clear it, go up. */
        for (j = 0; j < old_rows-1; j++) {
            lndebug("clear+up");
            snprintf(seq,64,"\r\x1b[0K\x1b[1A");
            abAppend(&ab,seq,strlen(seq));
            ab.append(seq);
        }
    }

@@ -631,21 +629,22 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
        /* Clean the top line. */
        lndebug("clear");
        snprintf(seq,64,"\r\x1b[0K");
        abAppend(&ab,seq,strlen(seq));
        ab.append(seq);
    }

    if (flags & REFRESH_WRITE) {
        /* Write the prompt and the current buffer content */
        abAppend(&ab,l->prompt,strlen(l->prompt));
        ab.append(l->prompt);
        if (maskmode == 1) {
            unsigned int i;
            for (i = 0; i < l->len; i++) abAppend(&ab,"*",1);
            for (unsigned int i = 0; i < l->len; ++i) {
                ab.append("*");
            }
        } else {
            abAppend(&ab,l->buf,l->len);
            ab.append(l->buf, l->len);
        }

        /* Show hits if any. */
        refreshShowHints(&ab,l,plen);
        refreshShowHints(ab, l, plen);

        /* If we are at the very end of the screen with our prompt, we need to
         * emit a newline and move the prompt to the first column. */

@@ -654,9 +653,9 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
            (l->pos+plen) % l->cols == 0)
        {
            lndebug("<newline>");
            abAppend(&ab,"\n",1);
            ab.append("\n");
            snprintf(seq,64,"\r");
            abAppend(&ab,seq,strlen(seq));
            ab.append(seq);
            rows++;
            if (rows > (int)l->oldrows) l->oldrows = rows;
        }

@@ -669,7 +668,7 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
        if (rows-rpos2 > 0) {
            lndebug("go-up %d", rows-rpos2);
            snprintf(seq,64,"\x1b[%dA", rows-rpos2);
            abAppend(&ab,seq,strlen(seq));
            ab.append(seq);
        }

        /* Set column. */

@@ -679,14 +678,12 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
            snprintf(seq,64,"\r\x1b[%dC", col);
        else
            snprintf(seq,64,"\r");
        abAppend(&ab,seq,strlen(seq));
        ab.append(seq);
    }

    lndebug("\n");
    l->oldpos = l->pos;

    if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */
    abFree(&ab);
    (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */
}

/* Calls the two low level functions refreshSingleLine() or

@@ -1313,16 +1310,17 @@ int linenoiseHistorySetMaxLen(int len) {
 * otherwise -1 is returned. */
int linenoiseHistorySave(const char *filename) {
    mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO);
    FILE *fp;
    int j;

    fp = fopen(filename,"w");
    File file;
    file.open(filename, "w");
    umask(old_umask);
    if (fp == NULL) return -1;
    if (file.file == NULL) {
        return -1;
    }
    chmod(filename,S_IRUSR|S_IWUSR);
    for (j = 0; j < history_len; j++)
        fprintf(fp,"%s\n",history[j]);
    fclose(fp);
    for (int j = 0; j < history_len; ++j) {
        fprintf(file.file, "%s\n", history[j]);
    }

    return 0;
}

@@ -1332,12 +1330,14 @@ int linenoiseHistorySave(const char *filename) {
 * If the file exists and the operation succeeded 0 is returned, otherwise
 * on error -1 is returned. */
int linenoiseHistoryLoad(const char *filename) {
    FILE *fp = fopen(filename,"r");
    File file;
    file.open(filename, "r");
    char buf[LINENOISE_MAX_LINE];
    if (file.file == NULL) {
        return -1;
    }

    if (fp == NULL) return -1;

    while (fgets(buf,LINENOISE_MAX_LINE,fp) != NULL) {
    while (fgets(buf, LINENOISE_MAX_LINE, file.file) != NULL) {
        char *p;

        p = strchr(buf,'\r');

@@ -1345,7 +1345,6 @@ int linenoiseHistoryLoad(const char *filename) {
        if (p) *p = '\0';
        linenoiseHistoryAdd(buf);
    }
    fclose(fp);
    return 0;
}
#endif
@@ -45,6 +45,7 @@ extern "C" {
#endif

#include <stddef.h> /* For size_t. */
#include <stdlib.h>

extern const char *linenoiseEditMore;

@@ -69,10 +70,23 @@ struct linenoiseState {
    int history_index; /* The history index we are currently editing. */
};

typedef struct linenoiseCompletions {
    size_t len;
    char **cvec;
} linenoiseCompletions;
struct linenoiseCompletions {
    size_t len = 0;
    char ** cvec = nullptr;
    bool to_free = true;

    ~linenoiseCompletions() {
        if (!to_free) {
            return;
        }

        for (size_t i = 0; i < len; ++i) {
            free(cvec[i]);
        }

        free(cvec);
    }
};

/* Non blocking API. */
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
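With this change, `linenoiseCompletions` frees its own entries in the destructor, which is why `freeCompletions()` and the manual `freeCompletions(&lc)` calls disappear from linenoise.cpp above; `to_free = false` is the opt-out for tables whose strings have been handed off elsewhere. A self-contained sketch of the ownership model (simplified stand-in, not the actual header):

```cpp
#include <cstdlib>
#include <cstring>

// Simplified stand-in for the new linenoiseCompletions above.
struct completions {
    size_t len = 0;
    char ** cvec = nullptr;
    bool to_free = true;

    ~completions() {
        if (!to_free) return;                       // ownership transferred
        for (size_t i = 0; i < len; ++i) free(cvec[i]);
        free(cvec);
    }
};

int main() {
    {
        completions lc;
        lc.cvec = (char **) malloc(sizeof(char *));
        lc.cvec[0] = strdup("hello");
        lc.len = 1;
    } // destructor frees "hello" and the vector: no freeCompletions() needed
    return 0;
}
```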
Binary file not shown.
@@ -267,6 +267,11 @@ struct server_task {
        params.speculative.n_min = std::max(params.speculative.n_min, 2);
        params.speculative.n_max = std::max(params.speculative.n_max, 0);

        // Use OpenAI API logprobs only if n_probs wasn't provided
        if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
            params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
        }

        if (data.contains("lora")) {
            if (data.at("lora").is_array()) {
                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));

@@ -1428,6 +1433,10 @@ struct server_queue {
        } else {
            queue_tasks.push_back(std::move(task));
        }
        // if this is cancel task make sure to clean up pending tasks
        if (task.type == SERVER_TASK_TYPE_CANCEL) {
            cleanup_pending_task(task.id_target);
        }
        condition_tasks.notify_one();
        return task.id;
    }

@@ -1445,6 +1454,10 @@ struct server_queue {
        } else {
            queue_tasks.push_back(std::move(task));
        }
        // if this is cancel task make sure to clean up pending tasks
        if (task.type == SERVER_TASK_TYPE_CANCEL) {
            cleanup_pending_task(task.id_target);
        }
    }
    condition_tasks.notify_one();
    return 0;

@@ -1539,6 +1552,20 @@ struct server_queue {
            }
        }
    }

private:
    void cleanup_pending_task(int id_task) {
        // no need lock because this is called exclusively by post()
        auto rm_func = [id_task](const server_task & task) {
            return task.id_target == id_task;
        };
        queue_tasks.erase(
            std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
            queue_tasks.end());
        queue_tasks_deferred.erase(
            std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
            queue_tasks_deferred.end());
    }
};

struct server_response {

@@ -1574,6 +1601,12 @@ struct server_response {

        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(id_task);
        // make sure to clean up all pending results
        queue_results.erase(
            std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
                return res->id == id_task;
            }),
            queue_results.end());
    }

    void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {

@@ -1593,7 +1626,7 @@ struct server_response {
            return !queue_results.empty();
        });

        for (int i = 0; i < (int) queue_results.size(); i++) {
        for (size_t i = 0; i < queue_results.size(); i++) {
            if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
                server_task_result_ptr res = std::move(queue_results[i]);
                queue_results.erase(queue_results.begin() + i);

@@ -1610,12 +1643,6 @@ struct server_response {
    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_results);
            bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
                return !queue_results.empty();
            });
            if (!cr_res) {
                return nullptr;
            }

            for (int i = 0; i < (int) queue_results.size(); i++) {
                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {

@@ -1624,6 +1651,11 @@ struct server_response {
                    return res;
                }
            }

            std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
            if (cr_res == std::cv_status::timeout) {
                return nullptr;
            }
        }

        // should never reach here
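The `recv_with_timeout` rework flips the order to scan-then-wait: the queue is checked for a matching result first, and only if nothing matches does the thread block on the condition variable. The old wait-first version used `!queue_results.empty()` as its predicate, so results already queued for *other* tasks would satisfy the wait immediately and the loop could spin without ever sleeping. A minimal sketch of the pattern, using hypothetical types rather than the server's:

```cpp
#include <chrono>
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>

// Illustrative scan-then-wait receive loop over a shared queue.
struct mailbox {
    std::mutex              m;
    std::condition_variable cv;
    std::deque<int>         results; // stand-in for server_task_result_ptr

    std::optional<int> recv_with_timeout(int wanted, std::chrono::seconds timeout) {
        std::unique_lock<std::mutex> lock(m);
        while (true) {
            // 1) scan: consume a matching result if one is already queued
            for (size_t i = 0; i < results.size(); i++) {
                if (results[i] == wanted) {
                    int res = results[i];
                    results.erase(results.begin() + i);
                    return res;
                }
            }
            // 2) wait: no "queue non-empty" predicate, so unrelated results
            //    cannot turn this loop into a busy spin; spurious wakeups
            //    simply rescan
            if (cv.wait_for(lock, timeout) == std::cv_status::timeout) {
                return std::nullopt;
            }
        }
    }
};
```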
@@ -1688,6 +1720,8 @@ struct server_context {
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;

    common_chat_templates chat_templates;

    ~server_context() {
        // Clear any sampling context
        for (server_slot & slot : slots) {

@@ -1728,13 +1762,16 @@ struct server_context {
        add_bos_token = llama_vocab_get_add_bos(vocab);
        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

        if (!params_base.speculative.model.empty()) {
        if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());

            auto params_dft = params_base;

            params_dft.devices = params_base.speculative.devices;
            params_dft.hf_file = params_base.speculative.hf_file;
            params_dft.hf_repo = params_base.speculative.hf_repo;
            params_dft.model = params_base.speculative.model;
            params_dft.model_url = params_base.speculative.model_url;
            params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
            params_dft.n_parallel = 1;

@@ -1762,16 +1799,44 @@ struct server_context {
            // force F16 KV cache for the draft model for extra performance
            cparams_dft.type_k = GGML_TYPE_F16;
            cparams_dft.type_v = GGML_TYPE_F16;

            // the context is not needed - we will create one for each slot
            llama_init_dft.context.reset();
        }

        chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
        GGML_ASSERT(chat_templates.template_default.get() != nullptr);

        return true;
    }

    bool validate_builtin_chat_template() const {
    bool validate_builtin_chat_template(bool use_jinja) const {
        llama_chat_message chat[] = {{"user", "test"}};
        const char * tmpl = llama_model_chat_template(model);
        const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
        return chat_res > 0;

        if (use_jinja) {
            auto templates = common_chat_templates_from_model(model, "");
            GGML_ASSERT(templates.template_default);
            try {
                templates.template_default->apply({{
                    {"role", "user"},
                    {"content", "test"},
                }}, json(), true);
                if (templates.template_tool_use) {
                    templates.template_tool_use->apply({{
                        {"role", "user"},
                        {"content", "test"},
                    }}, json(), true);
                }
                return true;
            } catch (const std::exception & e) {
                SRV_ERR("failed to apply template: %s\n", e.what());
                return false;
            }
        } else {
            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
            return chat_res > 0;
        }
    }

    void init() {

@@ -2338,8 +2403,8 @@ struct server_context {

        server_task task(SERVER_TASK_TYPE_CANCEL);
        task.id_target = id_task;
        cancel_tasks.push_back(task);
        queue_results.remove_waiting_task_id(id_task);
        cancel_tasks.push_back(task);
    }
    // push to beginning of the queue, so it has highest priority
    queue_tasks.post(cancel_tasks, true);

@@ -3656,9 +3721,12 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
            { "chat_template",               common_get_builtin_chat_template(ctx_server.model) },
            { "chat_template",               ctx_server.chat_templates.template_default->source() },
            { "build_info",                  build_info },
        };
        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
        }

        res_ok(res, data);
    };

@@ -3886,7 +3954,10 @@ int main(int argc, char ** argv) {
            return;
        }

        json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
        auto body = json::parse(req.body);
        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);

        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,

@@ -4296,7 +4367,7 @@ int main(int argc, char ** argv) {

    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
    if (params.chat_template.empty()) {
        if (!ctx_server.validate_builtin_chat_template()) {
        if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
            params.chat_template = "chatml";
        }

@@ -4304,8 +4375,8 @@ int main(int argc, char ** argv) {

    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
        ctx_server.chat_templates.template_default->source().c_str(),
        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());

    ctx_server.queue_tasks.on_new_task(std::bind(
        &server_context::process_single_task, &ctx_server, std::placeholders::_1));
@@ -4,22 +4,26 @@ from utils import *

server = ServerPreset.tinyllama2()


@pytest.fixture(scope="module", autouse=True)
@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


@pytest.mark.parametrize(
    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
    [
        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", False, None),
        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", True, None),
        (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
    ]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
    global server
    server.jinja = jinja
    server.chat_template = chat_template
    server.start()
    res = server.make_request("POST", "/chat/completions", data={
        "model": model,
@@ -72,13 +72,14 @@ class ServerProcess:
    pooling: str | None = None
    draft: int | None = None
    api_key: str | None = None
    response_format: str | None = None
    lora_files: List[str] | None = None
    disable_ctx_shift: int | None = False
    draft_min: int | None = None
    draft_max: int | None = None
    no_webui: bool | None = None
    jinja: bool | None = None
    chat_template: str | None = None
    chat_template_file: str | None = None

    # session variables
    process: subprocess.Popen | None = None

@@ -169,8 +170,12 @@ class ServerProcess:
            server_args.extend(["--draft-min", self.draft_min])
        if self.no_webui:
            server_args.append("--no-webui")
        if self.jinja:
            server_args.append("--jinja")
        if self.chat_template:
            server_args.extend(["--chat-template", self.chat_template])
        if self.chat_template_file:
            server_args.extend(["--chat-template-file", self.chat_template_file])

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"bench: starting server with: {' '.join(args)}")
@@ -16,6 +16,8 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat-template.hpp"

#include <random>
#include <sstream>

@@ -349,7 +351,7 @@ static llama_tokens format_infill(
}

// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
    std::vector<common_chat_msg> chat;

    for (size_t i = 0; i < messages.size(); ++i) {

@@ -377,7 +379,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        chat.push_back({role, content});
    }

    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());

    return formatted_chat;

@@ -576,14 +578,23 @@ static json oaicompat_completion_params_parse(const json & body) {
    return llama_params;
}

static json oaicompat_chat_completion_params_parse(
    const struct llama_model * model,
    const json & body, /* openai api json semantics */
    const std::string & chat_template) {
static json oaicompat_completion_params_parse(
    const json & body, /* openai api json semantics */
    const common_chat_template & tmpl,
    bool use_jinja)
{
    json llama_params;

    // Apply chat template to the list of messages
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
    auto tools = json_value(body, "tools", json());
    auto has_tools = tools.is_array() && !tools.empty();

    if (has_tools) {
        if (use_jinja) {
            LOG_WRN("tools param is not fully supported yet\n");
        } else {
            throw std::runtime_error("tools param requires --jinja flag");
        }
    }

    // Handle "stop" field
    if (body.contains("stop") && body.at("stop").is_string()) {

@@ -606,6 +617,13 @@ static json oaicompat_chat_completion_params_parse(
        }
    }

    // Apply chat template to the list of messages
    if (use_jinja) {
        llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
    } else {
        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
    }

    // Handle "n" field
    int n_choices = json_value(body, "n", 1);
    if (n_choices != 1) {

@@ -621,7 +639,7 @@ static json oaicompat_chat_completion_params_parse(
    }

    // Params supported by OAI but unsupported by llama.cpp
    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
    static const std::vector<std::string> unsupported_params { "tool_choice" };
    for (const auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
@@ -141,6 +141,7 @@
          :msg="pendingMsg"
          :key="pendingMsg.id"
          :is-generating="isGenerating"
          :show-thought-in-progress="config.showThoughtInProgress"
          :edit-user-msg-and-regenerate="() => {}"
          :regenerate-msg="() => {}"></message-bubble>
      </div>

@@ -202,6 +203,20 @@
            </template>
          </div>
        </details>
        <!-- Section: Reasoning models -->
        <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
          <summary class="collapse-title font-bold">Reasoning models</summary>
          <div class="collapse-content">
            <div class="flex flex-row items-center mb-2">
              <input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
              <span class="ml-4">Expand thought process by default when generating messages</span>
            </div>
            <div class="flex flex-row items-center mb-2">
              <input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
              <span class="ml-4">Exclude thought process when sending requests to the API (recommended for DeepSeek-R1)</span>
            </div>
          </div>
        </details>
        <!-- Section: Advanced config -->
        <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
          <summary class="collapse-title font-bold">Advanced config</summary>

@@ -261,7 +276,17 @@
          <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
          <!-- render message as markdown -->
          <div v-else dir="auto">
            <vue-markdown :source="msg.content"></vue-markdown>
            <details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
              <summary class="collapse-title">
                <span v-if="splitMsgContent.isThinking">
                  <span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
                  <b>Thinking</b>
                </span>
                <b v-else>Thought Process</b>
              </summary>
              <vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
            </details>
            <vue-markdown :source="splitMsgContent.content"></vue-markdown>
          </div>
          <!-- render timings if enabled -->
          <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
|
|||
|
||||
const isDev = import.meta.env.MODE === 'development';
|
||||
|
||||
// types
|
||||
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
|
||||
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
|
||||
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
|
||||
|
||||
// utility functions
|
||||
const isString = (x) => !!x.toLowerCase;
|
||||
const isBoolean = (x) => x === true || x === false;
|
||||
|
@ -50,6 +55,8 @@ const CONFIG_DEFAULT = {
|
|||
apiKey: '',
|
||||
systemMessage: 'You are a helpful assistant.',
|
||||
showTokensPerSecond: false,
|
||||
showThoughtInProgress: false,
|
||||
excludeThoughtOnReq: true,
|
||||
// make sure these default values are in sync with `common.h`
|
||||
samplers: 'edkypmxt',
|
||||
temperature: 0.8,
|
||||
|
@ -172,6 +179,7 @@ const MessageBubble = defineComponent({
|
|||
config: Object,
|
||||
msg: Object,
|
||||
isGenerating: Boolean,
|
||||
showThoughtInProgress: Boolean,
|
||||
editUserMsgAndRegenerate: Function,
|
||||
regenerateMsg: Function,
|
||||
},
|
||||
|
@@ -188,7 +196,31 @@ const MessageBubble = defineComponent({
        prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
        predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
      };
    }
  },
  splitMsgContent() {
    const content = this.msg.content;
    if (this.msg.role !== 'assistant') {
      return { content };
    }
    let actualContent = '';
    let cot = '';
    let isThinking = false;
    let thinkSplit = content.split('<think>', 2);
    actualContent += thinkSplit[0];
    while (thinkSplit[1] !== undefined) {
      // <think> tag found
      thinkSplit = thinkSplit[1].split('</think>', 2);
      cot += thinkSplit[0];
      isThinking = true;
      if (thinkSplit[1] !== undefined) {
        // </think> closing tag found
        isThinking = false;
        thinkSplit = thinkSplit[1].split('<think>', 2);
        actualContent += thinkSplit[0];
      }
    }
    return { content: actualContent, cot, isThinking };
  },
},
methods: {
  copyMsg() {
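To make the parsing behaviour concrete, here is a standalone worked example of what `splitMsgContent` computes for a DeepSeek-R1-style reply (a sketch that mirrors the computed property's logic on a plain string; the input text is invented for illustration):

```js
// Assistant reply that interleaves a thought block with the visible answer.
const content = 'Let me check.<think>2+2 is 4, so the answer is even.</think>The answer is 4.';

let thinkSplit = content.split('<think>', 2); // ['Let me check.', '2+2 ...</think>The answer is 4.']
const visible = thinkSplit[0];                // 'Let me check.'
thinkSplit = thinkSplit[1].split('</think>', 2);
const cot = thinkSplit[0];                    // shown in the collapsible "Thought Process" block
const rest = thinkSplit[1];                   // appended back onto the visible answer

// The closing tag was found, so isThinking ends up false.
console.log({ content: visible + rest, cot, isThinking: false });
// -> { content: 'Let me check.The answer is 4.',
//      cot: '2+2 is 4, so the answer is even.', isThinking: false }
```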
@@ -208,7 +240,10 @@ const MessageBubble = defineComponent({
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
  // manage conversations
  /**
   * manage conversations
   * @returns {Array<Conversation>}
   */
  getAllConversations() {
    const res = [];
    for (const key in localStorage) {
@@ -219,11 +254,19 @@ const StorageUtils = {
    res.sort((a, b) => b.lastModified - a.lastModified);
    return res;
  },
  // can return null if convId does not exist
  /**
   * can return null if convId does not exist
   * @param {string} convId
   * @returns {Conversation | null}
   */
  getOneConversation(convId) {
    return JSON.parse(localStorage.getItem(convId) || 'null');
  },
  // if convId does not exist, create one
  /**
   * if convId does not exist, create one
   * @param {string} convId
   * @param {Message} msg
   */
  appendMsg(convId, msg) {
    if (msg.content === null) return;
    const conv = StorageUtils.getOneConversation(convId) || {
@@ -235,12 +278,24 @@ const StorageUtils = {
    conv.lastModified = Date.now();
    localStorage.setItem(convId, JSON.stringify(conv));
  },
  /**
   * Get new conversation id
   * @returns {string}
   */
  getNewConvId() {
    return `conv-${Date.now()}`;
  },
  /**
   * remove conversation by id
   * @param {string} convId
   */
  remove(convId) {
    localStorage.removeItem(convId);
  },
  /**
   * keep only the messages matching the predicate
   * @param {string} convId
   */
  filterAndKeepMsgs(convId, predicate) {
    const conv = StorageUtils.getOneConversation(convId);
    if (!conv) return;
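Note that `filterAndKeepMsgs` is a predicate-based pruner rather than a bulk delete, which is what the corrected doc comment above reflects. A hedged sketch of a typical call, for example discarding everything after an edited message before regenerating (`editedMsg` is hypothetical; the surrounding call site is not part of this hunk):

```js
// Keep only messages older than the one being edited; the predicate
// receives each stored Message and returns true to keep it.
StorageUtils.filterAndKeepMsgs(convId, (msg) => msg.id < editedMsg.id);
```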
@@ -248,6 +303,11 @@ const StorageUtils = {
    conv.lastModified = Date.now();
    localStorage.setItem(convId, JSON.stringify(conv));
  },
  /**
   * remove last message from conversation
   * @param {string} convId
   * @returns {Message | undefined}
   */
  popMsg(convId) {
    const conv = StorageUtils.getOneConversation(convId);
    if (!conv) return;
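Since all of `StorageUtils` is a thin layer over `localStorage` with JSON round-trips, its behaviour can be demonstrated end to end. A sketch under the assumption that the helpers behave exactly as in the hunks above:

```js
const convId = StorageUtils.getNewConvId();            // e.g. 'conv-1736900000000'
StorageUtils.appendMsg(convId, { id: 1, role: 'user', content: 'Hi', timings: null });
StorageUtils.appendMsg(convId, { id: 2, role: 'assistant', content: 'Hello!', timings: null });

const conv = StorageUtils.getOneConversation(convId);  // parsed back from localStorage
console.log(conv.messages.length);                     // 2

StorageUtils.popMsg(convId);                           // drops the assistant reply
StorageUtils.remove(convId);                           // deletes the conversation entirely
```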
@@ -322,10 +382,12 @@ const mainApp = createApp({
  data() {
    return {
      conversations: StorageUtils.getAllConversations(),
      messages: [], // { id: number, role: 'user' | 'assistant', content: string }
      /** @type {Array<Message>} */
      messages: [],
      viewingConvId: StorageUtils.getNewConvId(),
      inputMsg: '',
      isGenerating: false,
      /** @type {Array<Message> | null} */
      pendingMsg: null, // the on-going message from assistant
      stopGeneration: () => {},
      selectedTheme: StorageUtils.getTheme(),
@@ -333,6 +395,7 @@
      showConfigDialog: false,
      // const
      themes: THEMES,
      /** @type {CONFIG_DEFAULT} */
      configDefault: {...CONFIG_DEFAULT},
      configInfo: {...CONFIG_INFO},
      isDev,
@@ -425,42 +488,50 @@ const mainApp = createApp({
      this.isGenerating = true;

      try {
        /** @type {CONFIG_DEFAULT} */
        const config = this.config;
        const abortController = new AbortController();
        this.stopGeneration = () => abortController.abort();
        /** @type {Array<APIMessage>} */
        let messages = [
          { role: 'system', content: config.systemMessage },
          ...normalizeMsgsForAPI(this.messages),
        ];
        if (config.excludeThoughtOnReq) {
          messages = filterThoughtFromMsgs(messages);
        }
        if (isDev) console.log({messages});
        const params = {
          messages: [
            { role: 'system', content: this.config.systemMessage },
            ...this.messages,
          ],
          messages,
          stream: true,
          cache_prompt: true,
          samplers: this.config.samplers,
          temperature: this.config.temperature,
          dynatemp_range: this.config.dynatemp_range,
          dynatemp_exponent: this.config.dynatemp_exponent,
          top_k: this.config.top_k,
          top_p: this.config.top_p,
          min_p: this.config.min_p,
          typical_p: this.config.typical_p,
          xtc_probability: this.config.xtc_probability,
          xtc_threshold: this.config.xtc_threshold,
          repeat_last_n: this.config.repeat_last_n,
          repeat_penalty: this.config.repeat_penalty,
          presence_penalty: this.config.presence_penalty,
          frequency_penalty: this.config.frequency_penalty,
          dry_multiplier: this.config.dry_multiplier,
          dry_base: this.config.dry_base,
          dry_allowed_length: this.config.dry_allowed_length,
          dry_penalty_last_n: this.config.dry_penalty_last_n,
          max_tokens: this.config.max_tokens,
          timings_per_token: !!this.config.showTokensPerSecond,
          ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
          samplers: config.samplers,
          temperature: config.temperature,
          dynatemp_range: config.dynatemp_range,
          dynatemp_exponent: config.dynatemp_exponent,
          top_k: config.top_k,
          top_p: config.top_p,
          min_p: config.min_p,
          typical_p: config.typical_p,
          xtc_probability: config.xtc_probability,
          xtc_threshold: config.xtc_threshold,
          repeat_last_n: config.repeat_last_n,
          repeat_penalty: config.repeat_penalty,
          presence_penalty: config.presence_penalty,
          frequency_penalty: config.frequency_penalty,
          dry_multiplier: config.dry_multiplier,
          dry_base: config.dry_base,
          dry_allowed_length: config.dry_allowed_length,
          dry_penalty_last_n: config.dry_penalty_last_n,
          max_tokens: config.max_tokens,
          timings_per_token: !!config.showTokensPerSecond,
          ...(config.custom.length ? JSON.parse(config.custom) : {}),
        };
        const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
            ...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
            ...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
          },
          body: JSON.stringify(params),
          signal: abortController.signal,
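Putting the new pre-processing together: with `excludeThoughtOnReq` enabled (the new default), an assistant turn that contained a `<think>` block is sent back to the server without it. A sketch of the resulting `params.messages` payload, with invented conversation content:

```js
// What the webui stored locally after one exchange:
const storedMsgs = [
  { id: 1, role: 'user', content: 'Is 4 even?', timings: null },
  { id: 2, role: 'assistant', content: '<think>4 mod 2 is 0.</think>Yes, 4 is even.', timings: null },
];

// What ends up in params.messages for the next request:
// [
//   { role: 'system', content: 'You are a helpful assistant.' },
//   { role: 'user', content: 'Is 4 even?' },
//   { role: 'assistant', content: 'Yes, 4 is even.' },  // thought stripped, id/timings dropped
// ]
```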
@@ -477,7 +548,7 @@
        };
      }
      const timings = chunk.timings;
      if (timings && this.config.showTokensPerSecond) {
      if (timings && config.showTokensPerSecond) {
        // only extract what's really needed, to save some space
        this.pendingMsg.timings = {
          prompt_n: timings.prompt_n,
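The per-token timings kept here feed the `prompt_per_second` and `predicted_per_second` computations in the `MessageBubble` component shown earlier; the arithmetic is a straight rate conversion. A worked example with made-up numbers:

```js
const timings = { prompt_n: 12, prompt_ms: 240, predicted_n: 100, predicted_ms: 2500 };

const prompt_per_second = timings.prompt_n / (timings.prompt_ms / 1000);          // 12 / 0.24 = 50 tok/s
const predicted_per_second = timings.predicted_n / (timings.predicted_ms / 1000); // 100 / 2.5 = 40 tok/s
```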
@@ -598,3 +669,33 @@ try {
    <button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
  </div>`;
}

/**
 * filter out redundant fields upon sending to API
 * @param {Array<Message>} messages
 * @returns {Array<APIMessage>}
 */
function normalizeMsgsForAPI(messages) {
  return messages.map((msg) => {
    return {
      role: msg.role,
      content: msg.content,
    };
  });
}
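In other words, `normalizeMsgsForAPI` maps the stored `Message` shape down to the `APIMessage` wire shape. A quick before/after with invented data:

```js
const input = [{ id: 7, role: 'user', content: 'Hi', timings: { prompt_n: 3 } }];
console.log(normalizeMsgsForAPI(input));
// -> [{ role: 'user', content: 'Hi' }]   (id and timings are dropped)
```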

/**
 * recommended for DeepSeek-R1, filter out content between <think> and </think> tags
 * @param {Array<APIMessage>} messages
 * @returns {Array<APIMessage>}
 */
function filterThoughtFromMsgs(messages) {
  return messages.map((msg) => {
    return {
      role: msg.role,
      content: msg.role === 'assistant'
        ? msg.content.split('</think>').at(-1).trim()
        : msg.content,
    };
  });
}
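`filterThoughtFromMsgs` keeps only what follows the last `</think>`, so everything up to and including that tag is dropped for assistant turns, while user turns pass through untouched. A short example:

```js
const msgs = [
  { role: 'user', content: 'Is 4 even?' },
  { role: 'assistant', content: '<think>4 mod 2 is 0.</think>Yes, 4 is even.' },
];
console.log(filterThoughtFromMsgs(msgs));
// -> [
//      { role: 'user', content: 'Is 4 even?' },
//      { role: 'assistant', content: 'Yes, 4 is even.' },
//    ]
```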