mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-16 19:59:16 +00:00
* webui: Move static build output from `tools/server/public` to `build/ui` directory * refactor: Move to `tools/ui` * refactor: rename CMake variables and preprocessor defines - Rename LLAMA_BUILD_WEBUI -> LLAMA_BUILD_UI (old kept as deprecated) - Rename LLAMA_USE_PREBUILT_WEBUI -> LLAMA_USE_PREBUILT_UI (old kept as deprecated) - Backward compat: old vars auto-forward to new ones with DEPRECATION warning - Rename internal vars: WEBUI_SOURCE -> UI_SOURCE, WEBUI_SOURCE_DIR -> UI_SOURCE_DIR, etc. - Rename HF bucket: LLAMA_WEBUI_HF_BUCKET -> LLAMA_UI_HF_BUCKET - Emit both LLAMA_BUILD_WEBUI and LLAMA_BUILD_UI preprocessor defines - Emit both LLAMA_WEBUI_DEFAULT_ENABLED and LLAMA_UI_DEFAULT_ENABLED * refactor: rename CLI flags (--webui -> --ui) with backward compat - Add --ui/--no-ui (old --webui/--no-webui kept as deprecated aliases) - Add --ui-config (old --webui-config kept as deprecated alias) - Add --ui-config-file (old --webui-config-file kept as deprecated alias) - Add --ui-mcp-proxy/--no-ui-mcp-proxy (old --webui-mcp-proxy kept as deprecated) - Add new env vars: LLAMA_ARG_UI, LLAMA_ARG_UI_CONFIG, LLAMA_ARG_UI_CONFIG_FILE, LLAMA_ARG_UI_MCP_PROXY - C++ struct fields: params.ui, params.ui_config_json, params.ui_mcp_proxy added alongside old fields - Backward compat: old fields synced to new ones in g_params_to_internals * refactor: update C++ server internals with backward compat - Rename json_webui_settings -> json_ui_settings (both kept in server_context_meta) - Rename params.webui usage -> params.ui (both synced, old still works) - JSON API emits both "ui"/"ui_settings" and "webui"/"webui_settings" keys - Server routes use params.ui_mcp_proxy || params.webui_mcp_proxy - Preprocessor guards use #if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI) * refactor: rename CI/CD workflows, artifacts, and build script - Rename webui-build.yml -> ui-build.yml; artifact webui-build -> ui-build - Rename webui-publish.yml -> ui-publish.yml; var 
HF_BUCKET_WEBUI_STATIC_OUTPUT -> HF_BUCKET_UI_STATIC_OUTPUT - Rename server-webui.yml -> server-ui.yml; job webui-build/checks -> ui-build/checks - Update server.yml: job/artifact refs webui-build -> ui-build - Update release.yml: all webui-build/publish refs -> ui-build/publish; HF_TOKEN_WEBUI_STATIC_OUTPUT -> HF_TOKEN_UI_STATIC_OUTPUT - Update server-self-hosted.yml: webui-build -> ui-build - Update build-self-hosted.yml: HF_WEBUI_VERSION -> HF_UI_VERSION - Rename webui-download.cmake -> ui-download.cmake (internal refs updated) - Update labeler.yml: server/webui -> server/ui path label * docs: update CODEOWNERS and server README docs - Update CODEOWNERS: team ggml-org/llama-webui -> ggml-org/llama-ui, path /tools/server/webui/ -> /tools/ui/ - Update server README.md: CLI tables show --ui flags with deprecated --webui aliases - Update server README-dev.md: "WebUI" -> "UI", paths updated to tools/ui/ * fix: Small fixes for UI build * fix: CMake.txt syntax * chore: Formatting * fix: `.editorconfig` for llama-ui * chore: Formatting * refactor: Use `APP_NAME` in Error route * refactor: Cleanup * refactor: Single migration service * make llama-ui a linkable target * fix: UI Build output * fix: Missing change * fix: separate llama-ui npm build output into build/tools/ui/dist subfolder + use cmake npm build instead of downloading ui-build.yml artifacts in CI * refactor: UI workflows cleanup --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
151 lines
5.2 KiB
C++
151 lines
5.2 KiB
C++
#pragma once
|
|
|
|
#include "server-http.h"
|
|
#include "server-task.h"
|
|
#include "server-queue.h"
|
|
|
|
#include <nlohmann/json_fwd.hpp>
|
|
|
|
#include <cstddef>
|
|
#include <memory>
|
|
#include <set>
|
|
|
|
struct server_context_impl; // private implementation
|
|
|
|
// Read-only snapshot of server and model metadata.
// Produced by server_context::get_meta() and cached by server_routes in a
// unique_ptr<const server_context_meta>.
// NOTE: contains a reference member (chat_params), so copy-assignment is
// implicitly deleted; the struct remains copy/move-constructible, which is
// what returning it by value from get_meta() relies on.
struct server_context_meta {
    std::string build_info;                 // build identifier of the running server binary
    std::string model_name;                 // display name of the loaded model
    std::set<std::string> model_aliases;    // alternative names the model can be addressed by
    std::set<std::string> model_tags;       // free-form tags associated with the model
    std::string model_path;                 // filesystem path the model was loaded from

    // input modality capabilities
    bool has_mtmd;       // multimodal (mtmd) support available — TODO confirm exact semantics
    bool has_inp_image;  // presumably: model accepts image input — verify against impl
    bool has_inp_audio;  // presumably: model accepts audio input — verify against impl

    json json_ui_settings;    // Primary: new name
    json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat)

    int slot_n_ctx;                        // context size per slot — TODO confirm derivation (n_ctx / n_slots?)
    enum llama_pooling_type pooling_type;  // pooling mode (relevant for embeddings output)

    // chat params
    // NOTE(review): reference member — lifetime presumably owned by the server
    // implementation; confirm it outlives every copy of this snapshot
    server_chat_params & chat_params;
    std::map<std::string, bool> chat_template_caps;  // capability flags of the chat template

    // tokens (special-token metadata of the loaded vocabulary)
    std::string bos_token_str;  // beginning-of-sequence token, rendered as text
    std::string eos_token_str;  // end-of-sequence token, rendered as text
    // fill-in-the-middle (FIM) special tokens; presumably unset/negative when
    // the vocab does not define them — verify
    llama_token fim_pre_token;
    llama_token fim_sub_token;
    llama_token fim_mid_token;
    llama_token fim_pad_token;
    llama_token fim_rep_token;
    llama_token fim_sep_token;

    // sampling
    std::vector<llama_logit_bias> logit_bias_eog;  // logit biases for end-of-generation tokens

    // model meta
    enum llama_vocab_type model_vocab_type;
    int32_t model_vocab_n_tokens;  // number of tokens in the vocabulary
    int32_t model_n_ctx_train;     // context length the model was trained with
    int32_t model_n_embd_inp;      // input embedding dimension
    uint64_t model_n_params;       // total parameter count
    uint64_t model_size;           // model size — presumably bytes; TODO confirm unit
};
|
|
|
|
// Main server object. Uses the pimpl idiom (server_context_impl is only
// forward-declared above) to keep heavy internals out of this header.
struct server_context {
    std::unique_ptr<server_context_impl> impl;  // pimpl; requires out-of-line ctor/dtor

    server_context();
    ~server_context();  // defined out-of-line so unique_ptr can delete the incomplete type

    // load the model and initialize llama_context
    // returns true on success
    bool load_model(common_params & params);

    // this function will block main thread until termination
    void start_loop();

    // terminate main loop (will unblock start_loop)
    void terminate();

    // get the underlying llama_context, can return nullptr if sleeping
    // not thread-safe, should only be used from the main thread
    llama_context * get_llama_context() const;

    // get a new response reader, used by CLI application
    server_response_reader get_response_reader();

    // get server metadata (read-only), can only be called after load_model()
    // not thread-safe, should only be used from the main thread
    server_context_meta get_meta() const;

    // register a callback to be called when sleeping state changes
    // must be set before load_model() is called
    void on_sleeping_changed(std::function<void(bool)> callback);
};
|
|
|
|
|
|
// forward declarations
|
|
struct server_res_generator;
|
|
|
|
// HTTP route handler collection for the server. Each public handler_t member
// is a stored callable invoked by the HTTP layer; handler names presumably
// mirror the HTTP method + endpoint they serve (e.g. get_health -> GET /health)
// — verify against the route registration in the implementation file.
struct server_routes {
    server_routes(const common_params & params, server_context & ctx_server);

    // presumably installs the handler_t members below — see implementation
    void init_routes();

    // note: this is not thread-safe and can only be used when ctx_http.is_ready is false
    void update_meta(const server_context & ctx_server) {
        // take a fresh read-only snapshot of the server/model metadata
        this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
    }

    // handlers using lambda function, so that they can capture `this` without `std::bind`
    // they won't be called until ctx_http.is_ready is set to true
    server_http_context::handler_t get_health;
    server_http_context::handler_t get_metrics;
    server_http_context::handler_t get_slots;
    server_http_context::handler_t post_slots;
    server_http_context::handler_t get_props;
    server_http_context::handler_t post_props;
    server_http_context::handler_t post_infill;
    server_http_context::handler_t post_completions;
    server_http_context::handler_t post_completions_oai;      // OpenAI-compatible variant
    server_http_context::handler_t post_chat_completions;
    server_http_context::handler_t post_responses_oai;        // OpenAI-compatible variant
    server_http_context::handler_t post_transcriptions_oai;   // OpenAI-compatible variant
    server_http_context::handler_t post_anthropic_messages;       // Anthropic-compatible endpoint
    server_http_context::handler_t post_anthropic_count_tokens;   // Anthropic-compatible endpoint
    server_http_context::handler_t post_apply_template;
    server_http_context::handler_t get_models;
    server_http_context::handler_t post_tokenize;
    server_http_context::handler_t post_detokenize;
    server_http_context::handler_t post_embeddings;
    server_http_context::handler_t post_embeddings_oai;       // OpenAI-compatible variant
    server_http_context::handler_t post_rerank;
    server_http_context::handler_t get_lora_adapters;
    server_http_context::handler_t post_lora_adapters;

    // to be used in router mode
    json get_model_info() const;

private:
    // shared implementation behind the completion-style endpoints; `type`
    // selects the task kind and `res_type` the response format
    std::unique_ptr<server_res_generator> handle_completions_impl(
            const server_http_req & req,
            server_task_type type,
            const json & data,
            const std::vector<raw_buffer> & files,
            task_response_type res_type);
    // slot state persistence helpers (save/restore/erase a single slot)
    std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
    std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
    // shared implementation behind the embeddings endpoints
    std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);

    // using unique_ptr to allow late initialization of const
    std::unique_ptr<const server_context_meta> meta;

    // non-owning references; presumably owned by the surrounding server setup
    // and outliving this object — verify at the construction site
    const common_params & params;
    const server_context_impl & ctx_server;

    server_queue & queue_tasks;
    server_response & queue_results;

    std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
};
|