Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	common/CMakeLists.txt
#	docs/backend/SYCL.md
#	docs/build.md
#	docs/docker.md
#	examples/export-lora/export-lora.cpp
#	examples/main/README.md
#	examples/main/main.cpp
#	examples/run/README.md
#	examples/run/run.cpp
#	examples/server/README.md
#	examples/simple-chat/simple-chat.cpp
#	ggml/CMakeLists.txt
#	ggml/src/ggml-hip/CMakeLists.txt
#	src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
This commit is contained in:
Concedo 2025-01-25 14:16:50 +08:00
commit bec231422a
46 changed files with 4305 additions and 578 deletions

Binary file not shown.

View file

@ -267,6 +267,11 @@ struct server_task {
params.speculative.n_min = std::max(params.speculative.n_min, 2);
params.speculative.n_max = std::max(params.speculative.n_max, 0);
// Use OpenAI API logprobs only if n_probs wasn't provided
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
}
if (data.contains("lora")) {
if (data.at("lora").is_array()) {
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
@ -1428,6 +1433,10 @@ struct server_queue {
} else {
queue_tasks.push_back(std::move(task));
}
// if this is cancel task make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
condition_tasks.notify_one();
return task.id;
}
@ -1445,6 +1454,10 @@ struct server_queue {
} else {
queue_tasks.push_back(std::move(task));
}
// if this is cancel task make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
}
condition_tasks.notify_one();
return 0;
@ -1539,6 +1552,20 @@ struct server_queue {
}
}
}
private:
void cleanup_pending_task(int id_task) {
// no need lock because this is called exclusively by post()
auto rm_func = [id_task](const server_task & task) {
return task.id_target == id_task;
};
queue_tasks.erase(
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
queue_tasks.end());
queue_tasks_deferred.erase(
std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
queue_tasks_deferred.end());
}
};
struct server_response {
@ -1574,6 +1601,12 @@ struct server_response {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// make sure to clean up all pending results
queue_results.erase(
std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
return res->id == id_task;
}),
queue_results.end());
}
void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
@ -1593,7 +1626,7 @@ struct server_response {
return !queue_results.empty();
});
for (int i = 0; i < (int) queue_results.size(); i++) {
for (size_t i = 0; i < queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
@ -1610,12 +1643,6 @@ struct server_response {
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
return !queue_results.empty();
});
if (!cr_res) {
return nullptr;
}
for (int i = 0; i < (int) queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
@ -1624,6 +1651,11 @@ struct server_response {
return res;
}
}
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
if (cr_res == std::cv_status::timeout) {
return nullptr;
}
}
// should never reach here
@ -1688,6 +1720,8 @@ struct server_context {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
common_chat_templates chat_templates;
~server_context() {
// Clear any sampling context
for (server_slot & slot : slots) {
@ -1728,13 +1762,16 @@ struct server_context {
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.empty()) {
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
auto params_dft = params_base;
params_dft.devices = params_base.speculative.devices;
params_dft.hf_file = params_base.speculative.hf_file;
params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
@ -1762,16 +1799,44 @@ struct server_context {
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot
llama_init_dft.context.reset();
}
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
return true;
}
bool validate_builtin_chat_template() const {
bool validate_builtin_chat_template(bool use_jinja) const {
llama_chat_message chat[] = {{"user", "test"}};
const char * tmpl = llama_model_chat_template(model);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
if (use_jinja) {
auto templates = common_chat_templates_from_model(model, "");
GGML_ASSERT(templates.template_default);
try {
templates.template_default->apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
if (templates.template_tool_use) {
templates.template_tool_use->apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
}
return true;
} catch (const std::exception & e) {
SRV_ERR("failed to apply template: %s\n", e.what());
return false;
}
} else {
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
}
void init() {
@ -2338,8 +2403,8 @@ struct server_context {
server_task task(SERVER_TASK_TYPE_CANCEL);
task.id_target = id_task;
cancel_tasks.push_back(task);
queue_results.remove_waiting_task_id(id_task);
cancel_tasks.push_back(task);
}
// push to beginning of the queue, so it has highest priority
queue_tasks.post(cancel_tasks, true);
@ -3656,9 +3721,12 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
{ "chat_template", common_get_builtin_chat_template(ctx_server.model) },
{ "chat_template", ctx_server.chat_templates.template_default->source() },
{ "build_info", build_info },
};
if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
}
res_ok(res, data);
};
@ -3886,7 +3954,10 @@ int main(int argc, char ** argv) {
return;
}
json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
auto body = json::parse(req.body);
const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
@ -4296,7 +4367,7 @@ int main(int argc, char ** argv) {
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_builtin_chat_template()) {
if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "chatml";
}
@ -4304,8 +4375,8 @@ int main(int argc, char ** argv) {
// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
common_chat_format_example(ctx_server.model, params.chat_template).c_str());
ctx_server.chat_templates.template_default->source().c_str(),
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));

View file

@ -4,22 +4,26 @@ from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
[
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", False, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", True, None),
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
global server
server.jinja = jinja
server.chat_template = chat_template
server.start()
res = server.make_request("POST", "/chat/completions", data={
"model": model,

View file

@ -72,13 +72,14 @@ class ServerProcess:
pooling: str | None = None
draft: int | None = None
api_key: str | None = None
response_format: str | None = None
lora_files: List[str] | None = None
disable_ctx_shift: int | None = False
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
jinja: bool | None = None
chat_template: str | None = None
chat_template_file: str | None = None
# session variables
process: subprocess.Popen | None = None
@ -169,8 +170,12 @@ class ServerProcess:
server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
if self.jinja:
server_args.append("--jinja")
if self.chat_template:
server_args.extend(["--chat-template", self.chat_template])
if self.chat_template_file:
server_args.extend(["--chat-template-file", self.chat_template_file])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")

View file

@ -16,6 +16,8 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat-template.hpp"
#include <random>
#include <sstream>
@ -349,7 +351,7 @@ static llama_tokens format_infill(
}
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
std::vector<common_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
@ -377,7 +379,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content});
}
const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat;
@ -576,14 +578,23 @@ static json oaicompat_completion_params_parse(const json & body) {
return llama_params;
}
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
const common_chat_template & tmpl,
bool use_jinja)
{
json llama_params;
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
auto tools = json_value(body, "tools", json());
auto has_tools = tools.is_array() && !tools.empty();
if (has_tools) {
if (use_jinja) {
LOG_WRN("tools param is not fully supported yet\n");
} else {
throw std::runtime_error("tools param requires --jinja flag");
}
}
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
@ -606,6 +617,13 @@ static json oaicompat_chat_completion_params_parse(
}
}
// Apply chat template to the list of messages
if (use_jinja) {
llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
} else {
llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
@ -621,7 +639,7 @@ static json oaicompat_chat_completion_params_parse(
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
static const std::vector<std::string> unsupported_params { "tool_choice" };
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);

View file

@ -141,6 +141,7 @@
:msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:show-thought-in-progress="config.showThoughtInProgress"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div>
@ -202,6 +203,20 @@
</template>
</div>
</details>
<!-- Section: Reasoning models -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Reasoning models</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
<span class="ml-4">Expand though process by default for generating message</span>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
<span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
</div>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
@ -261,7 +276,17 @@
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<div v-else dir="auto">
<vue-markdown :source="msg.content"></vue-markdown>
<details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
<summary class="collapse-title">
<span v-if="splitMsgContent.isThinking">
<span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
<b>Thinking</b>
</span>
<b v-else>Thought Process</b>
</summary>
<vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
</details>
<vue-markdown :source="splitMsgContent.content"></vue-markdown>
</div>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">

View file

@ -17,6 +17,11 @@ import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
const isDev = import.meta.env.MODE === 'development';
// types
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
// utility functions
const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false;
@ -50,6 +55,8 @@ const CONFIG_DEFAULT = {
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
@ -172,6 +179,7 @@ const MessageBubble = defineComponent({
config: Object,
msg: Object,
isGenerating: Boolean,
showThoughtInProgress: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
@ -188,7 +196,31 @@ const MessageBubble = defineComponent({
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
}
},
splitMsgContent() {
const content = this.msg.content;
if (this.msg.role !== 'assistant') {
return { content };
}
let actualContent = '';
let cot = '';
let isThinking = false;
let thinkSplit = content.split('<think>', 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
cot += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, cot, isThinking };
},
},
methods: {
copyMsg() {
@ -208,7 +240,10 @@ const MessageBubble = defineComponent({
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
// manage conversations
/**
* manage conversations
* @returns {Array<Conversation>}
*/
getAllConversations() {
const res = [];
for (const key in localStorage) {
@ -219,11 +254,19 @@ const StorageUtils = {
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
// can return null if convId does not exist
/**
* can return null if convId does not exist
* @param {string} convId
* @returns {Conversation | null}
*/
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
// if convId does not exist, create one
/**
* if convId does not exist, create one
* @param {string} convId
* @param {Message} msg
*/
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
@ -235,12 +278,24 @@ const StorageUtils = {
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* Get new conversation id
* @returns {string}
*/
getNewConvId() {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
* @param {string} convId
*/
remove(convId) {
localStorage.removeItem(convId);
},
/**
* remove all conversations
* @param {string} convId
*/
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
@ -248,6 +303,11 @@ const StorageUtils = {
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* remove last message from conversation
* @param {string} convId
* @returns {Message | undefined}
*/
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
@ -322,10 +382,12 @@ const mainApp = createApp({
data() {
return {
conversations: StorageUtils.getAllConversations(),
messages: [], // { id: number, role: 'user' | 'assistant', content: string }
/** @type {Array<Message>} */
messages: [],
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
/** @type {Array<Message> | null} */
pendingMsg: null, // the on-going message from assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
@ -333,6 +395,7 @@ const mainApp = createApp({
showConfigDialog: false,
// const
themes: THEMES,
/** @type {CONFIG_DEFAULT} */
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
isDev,
@ -425,42 +488,50 @@ const mainApp = createApp({
this.isGenerating = true;
try {
/** @type {CONFIG_DEFAULT} */
const config = this.config;
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
/** @type {Array<APIMessage>} */
let messages = [
{ role: 'system', content: config.systemMessage },
...normalizeMsgsForAPI(this.messages),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({messages});
const params = {
messages: [
{ role: 'system', content: this.config.systemMessage },
...this.messages,
],
messages,
stream: true,
cache_prompt: true,
samplers: this.config.samplers,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
};
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
},
body: JSON.stringify(params),
signal: abortController.signal,
@ -477,7 +548,7 @@ const mainApp = createApp({
};
}
const timings = chunk.timings;
if (timings && this.config.showTokensPerSecond) {
if (timings && config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
@ -598,3 +669,33 @@ try {
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}
/**
* filter out redundant fields upon sending to API
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function normalizeMsgsForAPI(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
});
}
/**
* recommended for DeepsSeek-R1, filter out content between <think> and </think> tags
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function filterThoughtFromMsgs(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.role === 'assistant'
? msg.content.split('</think>').at(-1).trim()
: msg.content,
};
});
}