mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
Merge commit '12a4a47e6a' into concedo_experimental
# Conflicts: # ci/run.sh # examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh # examples/model-conversion/scripts/causal/run-converted-model.sh # examples/model-conversion/scripts/embedding/run-converted-model.sh # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-zdnn/ggml-zdnn.cpp # ggml/src/ggml-zendnn/ggml-zendnn.cpp # tests/CMakeLists.txt # tests/test-chat-parser.cpp # tests/test-chat-peg-parser.cpp # tests/test-chat.cpp # tools/cli/cli.cpp
This commit is contained in:
commit
4984c9bc16
37 changed files with 463 additions and 240 deletions
|
|
@ -129,7 +129,7 @@ static void parse_json_tool_calls(
|
|||
}
|
||||
}
|
||||
|
||||
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
|
||||
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax)
|
||||
: input_(input), is_partial_(is_partial), syntax_(syntax)
|
||||
{
|
||||
result_.role = "assistant";
|
||||
|
|
@ -1611,7 +1611,7 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
|||
builder.finish();
|
||||
}
|
||||
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
|
||||
if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
|
||||
syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
|
||||
syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
|
||||
|
|
@ -1635,7 +1635,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
|||
return msg;
|
||||
}
|
||||
|
||||
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
|
||||
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
|
||||
if (parser.empty()) {
|
||||
throw std::runtime_error("Failed to parse due to missing parser definition.");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
#include "json-partial.h"
|
||||
#include "regex-partial.h"
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
|
@ -19,20 +19,20 @@ class common_chat_msg_partial_exception : public std::runtime_error {
|
|||
class common_chat_msg_parser {
|
||||
std::string input_;
|
||||
bool is_partial_;
|
||||
common_chat_syntax syntax_;
|
||||
common_chat_parser_params syntax_; // TODO: rename to params
|
||||
std::string healing_marker_;
|
||||
|
||||
size_t pos_ = 0;
|
||||
common_chat_msg result_;
|
||||
|
||||
public:
|
||||
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
|
||||
const std::string & input() const { return input_; }
|
||||
size_t pos() const { return pos_; }
|
||||
const std::string & healing_marker() const { return healing_marker_; }
|
||||
const bool & is_partial() const { return is_partial_; }
|
||||
const common_chat_msg & result() const { return result_; }
|
||||
const common_chat_syntax & syntax() const { return syntax_; }
|
||||
const common_chat_parser_params & syntax() const { return syntax_; }
|
||||
|
||||
void move_to(size_t pos) {
|
||||
if (pos > input_.size()) {
|
||||
|
|
|
|||
|
|
@ -615,18 +615,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
|
|||
return tmpls->has_explicit_template;
|
||||
}
|
||||
|
||||
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
|
||||
if (variant != nullptr) {
|
||||
if (strcmp(variant, "tool_use") == 0) {
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
|
||||
if (!variant.empty()) {
|
||||
if (variant == "tool_use") {
|
||||
if (tmpls->template_tool_use) {
|
||||
return tmpls->template_tool_use->source().c_str();
|
||||
return tmpls->template_tool_use->source();
|
||||
}
|
||||
return nullptr;
|
||||
return "";
|
||||
} else {
|
||||
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
|
||||
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
|
||||
}
|
||||
}
|
||||
return tmpls->template_default->source().c_str();
|
||||
return tmpls->template_default->source();
|
||||
}
|
||||
|
||||
common_chat_templates_ptr common_chat_templates_init(
|
||||
|
|
|
|||
|
|
@ -145,7 +145,7 @@ struct common_chat_templates_inputs {
|
|||
std::vector<common_chat_tool> tools;
|
||||
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
bool parallel_tool_calls = false;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
|
||||
bool enable_thinking = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
std::map<std::string, std::string> chat_template_kwargs;
|
||||
|
|
@ -165,14 +165,21 @@ struct common_chat_params {
|
|||
std::string parser;
|
||||
};
|
||||
|
||||
struct common_chat_syntax {
|
||||
// per-message parsing syntax
|
||||
// should be derived from common_chat_params
|
||||
struct common_chat_parser_params {
|
||||
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
|
||||
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
|
||||
bool reasoning_in_content = false;
|
||||
bool thinking_forced_open = false;
|
||||
bool parse_tool_calls = true;
|
||||
common_peg_arena parser = {};
|
||||
common_chat_parser_params() = default;
|
||||
common_chat_parser_params(const common_chat_params & chat_params) {
|
||||
format = chat_params.format;
|
||||
thinking_forced_open = chat_params.thinking_forced_open;
|
||||
}
|
||||
};
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
|
|
@ -191,7 +198,7 @@ common_chat_templates_ptr common_chat_templates_init(
|
|||
const std::string & eos_token_override = "");
|
||||
|
||||
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
|
||||
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
|
||||
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
|
||||
|
||||
|
||||
struct common_chat_params common_chat_templates_apply(
|
||||
|
|
@ -213,10 +220,12 @@ std::string common_chat_format_example(
|
|||
const std::map<std::string, std::string> & chat_template_kwargs);
|
||||
|
||||
const char* common_chat_format_name(common_chat_format format);
|
||||
const char* common_reasoning_format_name(common_reasoning_format format);
|
||||
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
|
||||
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
|
||||
|
||||
// used by arg and server
|
||||
const char * common_reasoning_format_name(common_reasoning_format format);
|
||||
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
|
|
@ -53,6 +54,8 @@ using llama_tokens = std::vector<llama_token>;
|
|||
|
||||
// build info
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
|
||||
//
|
||||
|
|
@ -280,6 +283,7 @@ struct common_params_diffusion {
|
|||
};
|
||||
|
||||
// reasoning API response format (not to be confused as chat template's reasoning format)
|
||||
// only used by server
|
||||
enum common_reasoning_format {
|
||||
COMMON_REASONING_FORMAT_NONE,
|
||||
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
|
||||
|
|
|
|||
|
|
@ -314,23 +314,26 @@ static bool common_pull_file(httplib::Client & cli,
|
|||
|
||||
// download one single file from remote URL to local path
|
||||
// returns status code or -1 on error
|
||||
static int common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
static int common_download_file_single_online(const std::string & url,
|
||||
const std::string & path,
|
||||
const std::string & bearer_token,
|
||||
const common_header_list & custom_headers) {
|
||||
static const int max_attempts = 3;
|
||||
static const int retry_delay_seconds = 2;
|
||||
|
||||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
|
||||
if (!bearer_token.empty()) {
|
||||
default_headers.insert({"Authorization", "Bearer " + bearer_token});
|
||||
}
|
||||
httplib::Headers headers;
|
||||
for (const auto & h : custom_headers) {
|
||||
default_headers.emplace(h.first, h.second);
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
cli.set_default_headers(default_headers);
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
}
|
||||
if (!bearer_token.empty()) {
|
||||
headers.emplace("Authorization", "Bearer " + bearer_token);
|
||||
}
|
||||
cli.set_default_headers(headers);
|
||||
|
||||
const bool file_exists = std::filesystem::exists(path);
|
||||
|
||||
|
|
@ -437,10 +440,12 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
|||
const common_remote_params & params) {
|
||||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
|
||||
|
||||
for (const auto & header : params.headers) {
|
||||
headers.emplace(header.first, header.second);
|
||||
httplib::Headers headers;
|
||||
for (const auto & h : params.headers) {
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
|
|
|
|||
|
|
@ -805,7 +805,7 @@ value member_expression::execute_impl(context & ctx) {
|
|||
} else if (is_val<value_string>(property)) {
|
||||
auto key = property->as_string().str();
|
||||
JJ_DEBUG("Accessing %s built-in '%s'", is_val<value_array>(object) ? "array" : "string", key.c_str());
|
||||
val = try_builtin_func(ctx, key, object);
|
||||
val = try_builtin_func(ctx, key, object, true);
|
||||
} else {
|
||||
throw std::runtime_error("Cannot access property with non-string/non-number: got " + property->type());
|
||||
}
|
||||
|
|
@ -814,7 +814,7 @@ value member_expression::execute_impl(context & ctx) {
|
|||
throw std::runtime_error("Cannot access property with non-string: got " + property->type());
|
||||
}
|
||||
auto key = property->as_string().str();
|
||||
val = try_builtin_func(ctx, key, object);
|
||||
val = try_builtin_func(ctx, key, object, true);
|
||||
}
|
||||
|
||||
if (ctx.is_get_stats && val && object && property) {
|
||||
|
|
|
|||
|
|
@ -203,6 +203,9 @@ struct value_int_t : public value_t {
|
|||
virtual int64_t as_int() const override { return val_int; }
|
||||
virtual double as_float() const override { return static_cast<double>(val_int); }
|
||||
virtual string as_string() const override { return std::to_string(val_int); }
|
||||
virtual bool as_bool() const override {
|
||||
return val_int != 0;
|
||||
}
|
||||
virtual const func_builtins & get_builtins() const override;
|
||||
};
|
||||
using value_int = std::shared_ptr<value_int_t>;
|
||||
|
|
@ -219,6 +222,9 @@ struct value_float_t : public value_t {
|
|||
if (out.back() == '.') out.push_back('0'); // leave one zero if no decimals
|
||||
return out;
|
||||
}
|
||||
virtual bool as_bool() const override {
|
||||
return val_flt != 0.0;
|
||||
}
|
||||
virtual const func_builtins & get_builtins() const override;
|
||||
};
|
||||
using value_float = std::shared_ptr<value_float_t>;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
// TODO: use json_fwd.hpp when possible
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
|
||||
|
|
|
|||
|
|
@ -1078,6 +1078,9 @@ class TextModel(ModelBase):
|
|||
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
|
||||
# ref: https://huggingface.co/aari1995/German_Semantic_V3
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
|
||||
# ref: https://huggingface.co/zai-org/GLM-4.7-Flash
|
||||
res = "glm4"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
|
|
@ -7458,7 +7461,7 @@ class DeepseekModel(TextModel):
|
|||
"DeepseekV3ForCausalLM",
|
||||
"KimiVLForConditionalGeneration",
|
||||
"YoutuForCausalLM",
|
||||
"YoutuVLForConditionalGeneration"
|
||||
"YoutuVLForConditionalGeneration",
|
||||
)
|
||||
class DeepseekV2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
|
|
@ -8446,6 +8449,32 @@ class Glm4MoeModel(TextModel):
|
|||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("Glm4MoeLiteForCausalLM")
|
||||
class Glm4MoeLiteModel(DeepseekV2Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
|
||||
# copied from Glm4MoeModel
|
||||
def set_vocab(self):
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# Special tokens
|
||||
# Note: Using <|endoftext|> (151329) for eot causes endless generation
|
||||
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
|
||||
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
|
||||
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
|
||||
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
class ChatGLMModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.CHATGLM
|
||||
|
|
@ -9183,7 +9212,7 @@ class NemotronHModel(GraniteHybridModel):
|
|||
return [(mapped_name, reshaped_data)]
|
||||
|
||||
if name.endswith("mixer.norm.weight"):
|
||||
reshaped_data = data_torch.reshape(8, 512)
|
||||
reshaped_data = data_torch.reshape(self.n_group, -1)
|
||||
mapped_name = self.map_tensor_name(name)
|
||||
return [(mapped_name, reshaped_data)]
|
||||
|
||||
|
|
|
|||
|
|
@ -170,6 +170,7 @@ pre_computed_hashes = [
|
|||
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
|
||||
# jina-v2-de variants
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -648,10 +648,11 @@ extern "C" {
|
|||
|
||||
// this tensor...
|
||||
enum ggml_tensor_flag {
|
||||
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
|
||||
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
|
||||
};
|
||||
|
||||
enum ggml_tri_type {
|
||||
|
|
@ -2602,11 +2603,42 @@ extern "C" {
|
|||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * sgd_params); // alpha, weight decay
|
||||
|
||||
// build forward mutiple tensors and select one of them for computing
|
||||
// this is useful for creating graphs that have constant topology but compute different things based on the input
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
|
||||
//
|
||||
// automatic differentiation
|
||||
// nodes:
|
||||
// | - build forward into the graph but do not compute
|
||||
// c - build forward into the graph and compute
|
||||
//
|
||||
// | | ... c ... |
|
||||
// | | ... c ... |
|
||||
// | | ... c ... |
|
||||
// [0 1 ... idx ... n-1] <-- ggml_build_forward_select(..., n, idx)
|
||||
// c
|
||||
// c
|
||||
//
|
||||
// example:
|
||||
// struct ggml_tensor * curs[3];
|
||||
//
|
||||
// curs[0] = compute0(...);
|
||||
// curs[1] = compute1(...);
|
||||
// curs[2] = compute2(...);
|
||||
//
|
||||
// int idx = select_branch(some_input);
|
||||
//
|
||||
// struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
|
||||
//
|
||||
GGML_API struct ggml_tensor * ggml_build_forward_select(
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** tensors,
|
||||
int n_tensors,
|
||||
int idx);
|
||||
|
||||
GGML_API void ggml_build_forward_expand(
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(
|
||||
struct ggml_context * ctx, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
|
|
@ -2638,7 +2670,7 @@ extern "C" {
|
|||
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
|
||||
// dump the graph into a file using the dot format
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);
|
||||
|
||||
// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
|
||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
|
|
|||
|
|
@ -77,39 +77,23 @@
|
|||
#include "ggml-zendnn.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic push
|
||||
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static std::string path_str(const fs::path & path) {
|
||||
std::string u8path;
|
||||
try {
|
||||
#if defined(__cpp_lib_char8_t)
|
||||
// C++20 and later: u8string() returns std::u8string
|
||||
std::u8string u8str = path.u8string();
|
||||
u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
|
||||
const std::u8string u8str = path.u8string();
|
||||
return std::string(reinterpret_cast<const char *>(u8str.data()), u8str.size());
|
||||
#else
|
||||
// C++17: u8string() returns std::string
|
||||
u8path = path.u8string();
|
||||
return path.u8string();
|
||||
#endif
|
||||
} catch (...) {
|
||||
return std::string();
|
||||
}
|
||||
return u8path;
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#elif defined(__GNUC__)
|
||||
# pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
|
|
|||
|
|
@ -881,9 +881,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|||
}
|
||||
if (sched->debug > 1) {
|
||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
|
||||
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
|
||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
|
||||
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
|
||||
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
|
|
@ -1929,6 +1929,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
|
|||
dst->view_offs = src->view_offs;
|
||||
}
|
||||
dst->op = src->op;
|
||||
dst->flags = src->flags;
|
||||
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
|
||||
ggml_set_name(dst, src->name);
|
||||
|
||||
|
|
|
|||
|
|
@ -226,6 +226,10 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
|||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
ggml_backend_blas_mul_mat(ctx, node);
|
||||
|
|
|
|||
|
|
@ -3809,6 +3809,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
|
|
|
|||
|
|
@ -2,6 +2,9 @@
|
|||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
# include <cub/cub.cuh>
|
||||
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 1)
|
||||
# define STRIDED_ITERATOR_AVAILABLE
|
||||
# endif
|
||||
using namespace cub;
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
|
|
@ -14,12 +17,14 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef STRIDED_ITERATOR_AVAILABLE
|
||||
static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
|
||||
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx <= nrows) {
|
||||
offsets[idx] = idx * ncols;
|
||||
}
|
||||
}
|
||||
#endif // STRIDED_ITERATOR_AVAILABLE
|
||||
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
||||
|
|
@ -31,19 +36,22 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
cudaStream_t stream) {
|
||||
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
|
||||
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
|
||||
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
|
||||
|
||||
int * temp_indices = temp_indices_alloc.get();
|
||||
float * temp_keys = temp_keys_alloc.get();
|
||||
int * d_offsets = offsets_alloc.get();
|
||||
|
||||
static const int block_size = 256;
|
||||
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
|
||||
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
|
||||
|
||||
const dim3 offset_grid((nrows + block_size - 1) / block_size);
|
||||
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
|
||||
|
||||
#ifdef STRIDED_ITERATOR_AVAILABLE
|
||||
auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
|
||||
#else
|
||||
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
|
||||
int * offset_iterator = offsets_alloc.get();
|
||||
const dim3 offset_grid((nrows + block_size - 1) / block_size);
|
||||
init_offsets<<<offset_grid, block_size, 0, stream>>>(offset_iterator, ncols, nrows);
|
||||
#endif
|
||||
CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
|
||||
|
||||
size_t temp_storage_bytes = 0;
|
||||
|
|
@ -57,7 +65,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols * nrows, nrows, // num items, num segments
|
||||
d_offsets, d_offsets + 1, stream);
|
||||
offset_iterator, offset_iterator + 1, stream);
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
|
|
@ -66,7 +74,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
|
||||
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
|
||||
dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -80,7 +89,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
|
||||
ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
|
||||
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream);
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
|
|
@ -89,8 +98,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
ncols, 0, sizeof(float) * 8, stream);
|
||||
} else {
|
||||
DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
|
||||
stream);
|
||||
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
|
||||
offset_iterator + 1, stream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1130,6 +1130,7 @@ struct ggml_tensor_extra_gpu {
|
|||
struct ggml_cuda_graph_node_properties {
|
||||
void * node_address;
|
||||
ggml_op node_op;
|
||||
int32_t flags;
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
void * src_address[GGML_MAX_SRC];
|
||||
|
|
|
|||
|
|
@ -2930,6 +2930,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
|||
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
|
||||
props->node_address = node->data;
|
||||
props->node_op = node->op;
|
||||
props->flags = node->flags;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
props->ne[i] = node->ne[i];
|
||||
props->nb[i] = node->nb[i];
|
||||
|
|
@ -2973,6 +2974,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
|
|||
return false;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -3390,6 +3395,9 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
|||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@
|
|||
#ifdef GGML_CUDA_USE_CUB
|
||||
# include <cub/cub.cuh>
|
||||
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
|
||||
# include <cuda/iterator>
|
||||
# define CUB_TOP_K_AVAILABLE
|
||||
using namespace cub;
|
||||
# endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
|
||||
|
|
|
|||
|
|
@ -611,6 +611,9 @@ static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const in
|
|||
if (node->op != ops[i]) {
|
||||
return false;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return false;
|
||||
}
|
||||
if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1084,12 +1084,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
op->src[0]->ne[0] != 112 &&
|
||||
op->src[0]->ne[0] != 128 &&
|
||||
op->src[0]->ne[0] != 192 &&
|
||||
op->src[0]->ne[0] != 256) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek sizes
|
||||
// TODO: disabled for now, until optmized
|
||||
op->src[0]->ne[0] != 256 &&
|
||||
op->src[0]->ne[0] != 576) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
|
|
|
|||
|
|
@ -203,6 +203,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
|||
GGML_ABORT("unsupported op");
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
// check if the current node can run concurrently with other nodes before it
|
||||
|
|
@ -2516,7 +2520,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
|
|||
|
||||
// simdgroups per threadgroup (a.k.a. warps)
|
||||
//nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
|
||||
int32_t nsg = 4;
|
||||
int32_t nsg = ne00 >= 512 ? 8 : 4;
|
||||
|
||||
const size_t smem = FATTN_SMEM(nsg);
|
||||
|
||||
|
|
|
|||
|
|
@ -5552,9 +5552,7 @@ void kernel_flash_attn_ext_impl(
|
|||
|
||||
constexpr short NC = (C/8)/NSG;
|
||||
|
||||
// note: do not unroll for large heads
|
||||
#pragma unroll (DK <= 64 ? NC : 1)
|
||||
for (short cc = 0; cc < NC; ++cc) {
|
||||
FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
|
||||
qk8x8_t mqk = make_filled_simdgroup_matrix<qk_t, 8>((qk_t) 0.0f);
|
||||
|
||||
if (DK % 16 != 0) {
|
||||
|
|
@ -5575,7 +5573,9 @@ void kernel_flash_attn_ext_impl(
|
|||
k8x8_t mk[2];
|
||||
q8x8_t mq[2];
|
||||
|
||||
FOR_UNROLL (short i = 0; i < DK8/2; ++i) {
|
||||
// note: too much unroll can tank the performance for large heads
|
||||
#pragma unroll (MIN(DK8/2, 4*NSG))
|
||||
for (short i = 0; i < DK8/2; ++i) {
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
simdgroup_load(mq[0], pq + 0*8 + 16*i, DK);
|
||||
|
|
@ -5749,7 +5749,9 @@ void kernel_flash_attn_ext_impl(
|
|||
pv += 8*NS20;
|
||||
}
|
||||
} else {
|
||||
FOR_UNROLL (short cc = 0; cc < (C/8)/2; ++cc) {
|
||||
constexpr short NC = (C/8)/2;
|
||||
|
||||
FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
|
||||
s8x8_t vs[2];
|
||||
|
||||
simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false);
|
||||
|
|
@ -5952,6 +5954,7 @@ kernel void kernel_flash_attn_ext(
|
|||
//case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
|
||||
//case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
|
||||
case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
|
||||
case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
|
||||
}
|
||||
#undef FWD_TMPL
|
||||
#undef FWD_ARGS
|
||||
|
|
|
|||
|
|
@ -12229,6 +12229,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) {
|
||||
return false;
|
||||
}
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
|
||||
ctx->semaphore_idx = 0;
|
||||
|
|
@ -13683,7 +13686,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||
while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
|
||||
while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) {
|
||||
last_node -= 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3457,7 +3457,8 @@ struct ggml_tensor * ggml_cast(
|
|||
|
||||
result->op = GGML_OP_CPY;
|
||||
result->src[0] = a;
|
||||
result->src[1] = result;
|
||||
result->src[1] = result; // note: this self-reference might seem redundant, but it's actually needed by some
|
||||
// backends for consistency with ggml_cpy_impl() above
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
@ -6741,20 +6742,35 @@ static void ggml_compute_backward(
|
|||
GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
|
||||
}
|
||||
|
||||
static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
|
||||
// check if already visited
|
||||
size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
static size_t ggml_visit_parents_graph(struct ggml_cgraph * cgraph, struct ggml_tensor * node, bool compute) {
|
||||
if (node->op != GGML_OP_NONE && compute) {
|
||||
node->flags |= GGML_TENSOR_FLAG_COMPUTE;
|
||||
}
|
||||
|
||||
const size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
|
||||
if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
|
||||
// This is the first time we see this node in the current graph.
|
||||
cgraph->visited_hash_set.keys[node_hash_pos] = node;
|
||||
ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
|
||||
cgraph->use_counts[node_hash_pos] = 0;
|
||||
} else {
|
||||
|
||||
if (ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
|
||||
// already visited
|
||||
|
||||
if (compute) {
|
||||
// update the compute flag regardless
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
struct ggml_tensor * src = node->src[i];
|
||||
if (src && ((src->flags & GGML_TENSOR_FLAG_COMPUTE) == 0)) {
|
||||
ggml_visit_parents_graph(cgraph, src, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return node_hash_pos;
|
||||
}
|
||||
|
||||
// This is the first time we see this node in the current graph.
|
||||
cgraph->visited_hash_set.keys[node_hash_pos] = node;
|
||||
ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
|
||||
cgraph->use_counts[node_hash_pos] = 0;
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
const int k =
|
||||
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
|
||||
|
|
@ -6763,7 +6779,7 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor
|
|||
|
||||
struct ggml_tensor * src = node->src[k];
|
||||
if (src) {
|
||||
size_t src_hash_pos = ggml_visit_parents(cgraph, src);
|
||||
const size_t src_hash_pos = ggml_visit_parents_graph(cgraph, src, compute);
|
||||
|
||||
// Update the use count for this operand.
|
||||
cgraph->use_counts[src_hash_pos]++;
|
||||
|
|
@ -6794,17 +6810,17 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor
|
|||
return node_hash_pos;
|
||||
}
|
||||
|
||||
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
|
||||
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand, bool compute) {
|
||||
if (!expand) {
|
||||
// TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
|
||||
ggml_graph_clear(cgraph);
|
||||
}
|
||||
|
||||
const int n0 = cgraph->n_nodes;
|
||||
const int n_old = cgraph->n_nodes;
|
||||
|
||||
ggml_visit_parents(cgraph, tensor);
|
||||
ggml_visit_parents_graph(cgraph, tensor, compute);
|
||||
|
||||
const int n_new = cgraph->n_nodes - n0;
|
||||
const int n_new = cgraph->n_nodes - n_old;
|
||||
GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
|
||||
|
||||
if (n_new > 0) {
|
||||
|
|
@ -6813,8 +6829,22 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_build_forward_select(
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** tensors,
|
||||
int n_tensors,
|
||||
int idx) {
|
||||
GGML_ASSERT(idx >= 0 && idx < n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
ggml_build_forward_impl(cgraph, tensors[i], true, i == idx ? true : false);
|
||||
}
|
||||
|
||||
return tensors[idx];
|
||||
}
|
||||
|
||||
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||
ggml_build_forward_impl(cgraph, tensor, true);
|
||||
ggml_build_forward_impl(cgraph, tensor, true, true);
|
||||
}
|
||||
|
||||
void ggml_build_backward_expand(
|
||||
|
|
@ -7245,6 +7275,10 @@ bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
|
|||
return false;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ggml_node_list_find_tensor(cgraph, outputs, num_outputs, node) != -1) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -7326,7 +7360,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
|
|||
label);
|
||||
}
|
||||
|
||||
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
||||
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename) {
|
||||
char color[16];
|
||||
|
||||
FILE * fp = ggml_fopen(filename, "w");
|
||||
|
|
@ -7347,7 +7381,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
snprintf(color, sizeof(color), "yellow");
|
||||
} else if (grad) {
|
||||
if (ggml_graph_find(gf, node)) {
|
||||
if (ggml_graph_find(cgraph, node)) {
|
||||
snprintf(color, sizeof(color), "green");
|
||||
} else {
|
||||
snprintf(color, sizeof(color), "lightblue");
|
||||
|
|
|
|||
|
|
@ -778,7 +778,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
FILE * file = ggml_fopen(fname, "rb");
|
||||
|
||||
if (!file) {
|
||||
GGML_LOG_ERROR("%s: failed to open GGUF file '%s'\n", __func__, fname);
|
||||
GGML_LOG_ERROR("%s: failed to open GGUF file '%s' (%s)\n", __func__, fname, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -265,7 +265,8 @@ struct llama_file::impl {
|
|||
continue; // Interrupted by signal, retry
|
||||
}
|
||||
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
||||
if (errno == EFAULT) {
|
||||
if (errno == EFAULT || errno == EINVAL) {
|
||||
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
|
||||
auto curr_off = tell();
|
||||
close(fd);
|
||||
fd = -1;
|
||||
|
|
@ -384,6 +385,9 @@ int llama_file::file_id() const {
|
|||
#ifdef _WIN32
|
||||
return _fileno(pimpl->fp);
|
||||
#else
|
||||
if (pimpl->fd != -1) {
|
||||
return pimpl->fd;
|
||||
}
|
||||
#if defined(fileno)
|
||||
return fileno(pimpl->fp);
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -543,12 +543,18 @@ llama_model_loader::llama_model_loader(
|
|||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
use_direct_io = use_direct_io && files.back()->has_direct_io();
|
||||
|
||||
// Disable mmap in case Direct I/O is enabled and available
|
||||
if (use_direct_io && use_mmap) {
|
||||
use_mmap = false;
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
if (use_mmap && use_direct_io) {
|
||||
if (files.back()->has_direct_io()) {
|
||||
// Disable mmap, as DirectIO is available
|
||||
use_mmap = false;
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
} else {
|
||||
// Disable DirectIO and reopen file using std::fopen for mmap
|
||||
use_direct_io = false;
|
||||
files.pop_back();
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
|
|
|
|||
|
|
@ -1823,7 +1823,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
|
||||
// that have no expert_gating_func model parameter set
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
|
||||
// GLM 4.7 Lite
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
|
||||
} else {
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
}
|
||||
|
||||
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
|
||||
|
|
|
|||
|
|
@ -191,6 +191,84 @@ static void test_conditionals(testing & t) {
|
|||
json::object(),
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is undefined falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
json::object(),
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is undefined attribute falsy",
|
||||
"{{ 'yes' if not y.x else 'no' }}",
|
||||
{{"y", true}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is undefined key falsy",
|
||||
"{{ 'yes' if not y['x'] else 'no' }}",
|
||||
{{"y", {{}}}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is empty array falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", json::array()}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is empty object falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", json::object()}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is empty string falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", ""}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 0 falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", 0}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 0.0 falsy",
|
||||
"{{ 'yes' if not y else 'no' }}",
|
||||
{{"y", 0.0}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is non-empty array truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", json::array({""})}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is non-empty object truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", {"x", false}}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is non-empty string truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", "0"}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 1 truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", 1}},
|
||||
"yes"
|
||||
);
|
||||
|
||||
test_template(t, "is 1.0 truthy",
|
||||
"{{ 'yes' if y else 'no' }}",
|
||||
{{"y", 1.0}},
|
||||
"yes"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_loops(testing & t) {
|
||||
|
|
|
|||
|
|
@ -779,7 +779,6 @@ static void handle_media(
|
|||
// download remote image
|
||||
// TODO @ngxson : maybe make these params configurable
|
||||
common_remote_params params;
|
||||
params.headers.push_back({"User-Agent", "llama.cpp/" + build_info});
|
||||
params.max_size = 1024 * 1024 * 10; // 10MB
|
||||
params.timeout = 10; // seconds
|
||||
SRV_INF("downloading image from '%s'\n", url.c_str());
|
||||
|
|
@ -831,7 +830,7 @@ static void handle_media(
|
|||
// used by /chat/completions endpoint
|
||||
json oaicompat_chat_params_parse(
|
||||
json & body, /* openai api json semantics */
|
||||
const oaicompat_parser_options & opt,
|
||||
const server_chat_params & opt,
|
||||
std::vector<raw_buffer> & out_files)
|
||||
{
|
||||
json llama_params;
|
||||
|
|
@ -1012,7 +1011,7 @@ json oaicompat_chat_params_parse(
|
|||
}
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
|
||||
|
||||
/* Append assistant prefilled message */
|
||||
if (prefill_assistant_message) {
|
||||
|
|
|
|||
|
|
@ -13,8 +13,6 @@
|
|||
#include <vector>
|
||||
#include <cinttypes>
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
|
|
@ -274,25 +272,26 @@ std::vector<server_tokens> tokenize_input_prompts(
|
|||
// OAI utils
|
||||
//
|
||||
|
||||
// used by /completions endpoint
|
||||
json oaicompat_completion_params_parse(const json & body);
|
||||
|
||||
struct oaicompat_parser_options {
|
||||
// global server parameters for chat formatting / parsing
|
||||
struct server_chat_params {
|
||||
bool use_jinja;
|
||||
bool prefill_assistant;
|
||||
common_reasoning_format reasoning_format;
|
||||
std::map<std::string,std::string> chat_template_kwargs;
|
||||
common_chat_templates * tmpls;
|
||||
std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value
|
||||
common_chat_templates_ptr tmpls;
|
||||
bool allow_image;
|
||||
bool allow_audio;
|
||||
bool enable_thinking = true;
|
||||
std::string media_path;
|
||||
};
|
||||
|
||||
// used by /completions endpoint
|
||||
json oaicompat_completion_params_parse(const json & body);
|
||||
|
||||
// used by /chat/completions endpoint
|
||||
json oaicompat_chat_params_parse(
|
||||
json & body, /* openai api json semantics */
|
||||
const oaicompat_parser_options & opt,
|
||||
const server_chat_params & opt,
|
||||
std::vector<raw_buffer> & out_files);
|
||||
|
||||
// convert Anthropic Messages API format to OpenAI Chat Completions API format
|
||||
|
|
|
|||
|
|
@ -534,8 +534,8 @@ public:
|
|||
server_queue queue_tasks;
|
||||
server_response queue_results;
|
||||
|
||||
common_chat_templates_ptr chat_templates;
|
||||
oaicompat_parser_options oai_parser_opt;
|
||||
// note: chat_params must not be refreshed upon existing sleeping state
|
||||
server_chat_params chat_params;
|
||||
|
||||
~server_context_impl() {
|
||||
if (!sleeping) {
|
||||
|
|
@ -688,15 +688,6 @@ private:
|
|||
llama_init_dft->free_context();
|
||||
}
|
||||
|
||||
chat_templates = common_chat_templates_init(model, params_base.chat_template);
|
||||
try {
|
||||
common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
|
||||
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
|
||||
chat_templates = common_chat_templates_init(model, "chatml");
|
||||
}
|
||||
|
||||
std::string & mmproj_path = params_base.mmproj.path;
|
||||
if (!mmproj_path.empty()) {
|
||||
if (!is_resume) {
|
||||
|
|
@ -845,30 +836,6 @@ private:
|
|||
model_name = model_path.filename().string();
|
||||
}
|
||||
|
||||
// thinking is enabled if:
|
||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||
// 2. The chat template supports it
|
||||
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
||||
SRV_INF("thinking = %d\n", enable_thinking);
|
||||
|
||||
oai_parser_opt = {
|
||||
/* use_jinja */ params_base.use_jinja,
|
||||
/* prefill_assistant */ params_base.prefill_assistant,
|
||||
/* reasoning_format */ params_base.reasoning_format,
|
||||
/* chat_template_kwargs */ params_base.default_template_kwargs,
|
||||
/* common_chat_templates */ chat_templates.get(),
|
||||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
/* media_path */ params_base.media_path,
|
||||
};
|
||||
|
||||
// print sample chat example to make it clear which template is used
|
||||
// @ngxson modern templates are too long, spam the logs; printing the example is enough
|
||||
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
|
||||
// common_chat_templates_source(chat_templates.get()),
|
||||
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
|
||||
|
||||
if (!is_resume) {
|
||||
return init();
|
||||
}
|
||||
|
|
@ -907,6 +874,42 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
// populate chat template params
|
||||
{
|
||||
common_chat_templates_ptr chat_templates;
|
||||
|
||||
try {
|
||||
chat_templates = common_chat_templates_init(model, params_base.chat_template);
|
||||
|
||||
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
|
||||
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
|
||||
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
|
||||
SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
|
||||
SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
// thinking is enabled if:
|
||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||
// 2. The chat template supports it
|
||||
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
|
||||
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
|
||||
|
||||
chat_params = {
|
||||
/* use_jinja */ params_base.use_jinja,
|
||||
/* prefill_assistant */ params_base.prefill_assistant,
|
||||
/* reasoning_format */ params_base.reasoning_format,
|
||||
/* chat_template_kwargs */ params_base.default_template_kwargs,
|
||||
/* tmpls */ std::move(chat_templates),
|
||||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
/* media_path */ params_base.media_path,
|
||||
};
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1326,11 +1329,12 @@ private:
|
|||
}
|
||||
|
||||
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
|
||||
const size_t n_probs = slot.task->params.sampling.n_probs;
|
||||
const size_t n_probs_request = slot.task->params.sampling.n_probs;
|
||||
|
||||
if (post_sampling) {
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true);
|
||||
const size_t max_probs = cur_p->size;
|
||||
const size_t n_probs = std::min(max_probs, n_probs_request);
|
||||
|
||||
// set probability for sampled token
|
||||
for (size_t i = 0; i < max_probs; i++) {
|
||||
|
|
@ -1341,8 +1345,8 @@ private:
|
|||
}
|
||||
|
||||
// set probability for top n_probs tokens
|
||||
result.probs.reserve(max_probs);
|
||||
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
|
||||
result.probs.reserve(n_probs);
|
||||
for (size_t i = 0; i < n_probs; i++) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
common_token_to_piece(ctx, cur_p->data[i].id, special),
|
||||
|
|
@ -1352,9 +1356,11 @@ private:
|
|||
} else {
|
||||
// TODO: optimize this with min-p optimization
|
||||
std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
|
||||
const size_t max_probs = cur.size();
|
||||
const size_t n_probs = std::min(max_probs, n_probs_request);
|
||||
|
||||
// set probability for sampled token
|
||||
for (size_t i = 0; i < cur.size(); i++) {
|
||||
for (size_t i = 0; i < max_probs; i++) {
|
||||
// set probability for sampled token
|
||||
if (cur[i].id == result.tok) {
|
||||
result.prob = cur[i].p;
|
||||
|
|
@ -1364,7 +1370,7 @@ private:
|
|||
|
||||
// set probability for top n_probs tokens
|
||||
result.probs.reserve(n_probs);
|
||||
for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) {
|
||||
for (size_t i = 0; i < n_probs; i++) {
|
||||
result.probs.push_back({
|
||||
cur[i].id,
|
||||
common_token_to_piece(ctx, cur[i].id, special),
|
||||
|
|
@ -1585,32 +1591,14 @@ private:
|
|||
|
||||
// tokenize the input if it's set by CLI, return false on error
|
||||
bool tokenize_cli_input(server_task & task) {
|
||||
GGML_ASSERT(task.cli_input != nullptr);
|
||||
try {
|
||||
auto & opt = oai_parser_opt;
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
|
||||
inputs.tools = {}; // TODO
|
||||
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
inputs.json_schema = ""; // TODO
|
||||
inputs.grammar = ""; // TODO
|
||||
inputs.use_jinja = opt.use_jinja;
|
||||
inputs.parallel_tool_calls = false;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
inputs.enable_thinking = opt.enable_thinking;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
|
||||
|
||||
// tokenize the resulting prompt
|
||||
auto & prompt = chat_params.prompt;
|
||||
auto & prompt = task.cli_prompt;
|
||||
if (mctx != nullptr) {
|
||||
task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
|
||||
} else {
|
||||
task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
|
||||
}
|
||||
task.cli_input.clear();
|
||||
task.cli_prompt.clear();
|
||||
task.cli_files.clear();
|
||||
} catch (const std::exception & e) {
|
||||
send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
|
||||
|
|
@ -1686,7 +1674,7 @@ private:
|
|||
{
|
||||
// special case: if input is provided via CLI, tokenize it first
|
||||
// otherwise, no need to tokenize as it's already done inside the HTTP thread
|
||||
if (task.cli_input != nullptr) {
|
||||
if (task.cli) {
|
||||
if (!tokenize_cli_input(task)) {
|
||||
break;
|
||||
}
|
||||
|
|
@ -2898,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
|
|||
}
|
||||
|
||||
server_context_meta server_context::get_meta() const {
|
||||
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
|
||||
|
||||
auto bos_id = llama_vocab_bos(impl->vocab);
|
||||
auto eos_id = llama_vocab_eos(impl->vocab);
|
||||
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
|
||||
|
|
@ -2910,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
|
|||
/* model_name */ impl->model_name,
|
||||
/* model_path */ impl->params_base.model.path,
|
||||
/* has_mtmd */ impl->mctx != nullptr,
|
||||
/* has_inp_image */ impl->oai_parser_opt.allow_image,
|
||||
/* has_inp_audio */ impl->oai_parser_opt.allow_audio,
|
||||
/* has_inp_image */ impl->chat_params.allow_image,
|
||||
/* has_inp_audio */ impl->chat_params.allow_audio,
|
||||
/* json_webui_settings */ impl->json_webui_settings,
|
||||
/* slot_n_ctx */ impl->get_slot_n_ctx(),
|
||||
/* pooling_type */ llama_pooling_type(impl->ctx),
|
||||
|
||||
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
|
||||
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
|
||||
/* chat_params */ impl->chat_params,
|
||||
|
||||
/* bos_token_str */ bos_token_str,
|
||||
/* eos_token_str */ eos_token_str,
|
||||
|
|
@ -3199,8 +3184,8 @@ void server_routes::init_routes() {
|
|||
|
||||
// this endpoint can be accessed during sleeping
|
||||
// the next LOC is to avoid someone accidentally use ctx_server
|
||||
bool server_ctx; // do NOT delete this line
|
||||
GGML_UNUSED(server_ctx);
|
||||
bool ctx_server; // do NOT delete this line
|
||||
GGML_UNUSED(ctx_server);
|
||||
|
||||
res->ok({{"status", "ok"}});
|
||||
return res;
|
||||
|
|
@ -3390,8 +3375,8 @@ void server_routes::init_routes() {
|
|||
|
||||
// this endpoint can be accessed during sleeping
|
||||
// the next LOC is to avoid someone accidentally use ctx_server
|
||||
bool server_ctx; // do NOT delete this line
|
||||
GGML_UNUSED(server_ctx);
|
||||
bool ctx_server; // do NOT delete this line
|
||||
GGML_UNUSED(ctx_server);
|
||||
|
||||
task_params tparams;
|
||||
tparams.sampling = params.sampling;
|
||||
|
|
@ -3400,6 +3385,9 @@ void server_routes::init_routes() {
|
|||
{ "n_ctx", meta->slot_n_ctx },
|
||||
};
|
||||
|
||||
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
|
||||
std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
|
||||
|
||||
json props = {
|
||||
{ "default_generation_settings", default_generation_settings_for_props },
|
||||
{ "total_slots", params.n_parallel },
|
||||
|
|
@ -3414,15 +3402,15 @@ void server_routes::init_routes() {
|
|||
{ "endpoint_metrics", params.endpoint_metrics },
|
||||
{ "webui", params.webui },
|
||||
{ "webui_settings", meta->json_webui_settings },
|
||||
{ "chat_template", meta->chat_template },
|
||||
{ "chat_template", tmpl_default },
|
||||
{ "bos_token", meta->bos_token_str },
|
||||
{ "eos_token", meta->eos_token_str },
|
||||
{ "build_info", meta->build_info },
|
||||
{ "is_sleeping", queue_tasks.is_sleeping() },
|
||||
};
|
||||
if (params.use_jinja) {
|
||||
if (!meta->chat_template_tool_use.empty()) {
|
||||
props["chat_template_tool_use"] = meta->chat_template_tool_use;
|
||||
if (!tmpl_tools.empty()) {
|
||||
props["chat_template_tool_use"] = tmpl_tools;
|
||||
}
|
||||
}
|
||||
res->ok(props);
|
||||
|
|
@ -3443,6 +3431,7 @@ void server_routes::init_routes() {
|
|||
|
||||
this->get_api_show = [this](const server_http_req &) {
|
||||
auto res = create_response();
|
||||
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
|
||||
json data = {
|
||||
{
|
||||
"model_info", {
|
||||
|
|
@ -3451,7 +3440,7 @@ void server_routes::init_routes() {
|
|||
},
|
||||
{"modelfile", ""},
|
||||
{"parameters", ""},
|
||||
{"template", meta->chat_template},
|
||||
{"template", tmpl_default},
|
||||
{"details", {
|
||||
{"parent_model", ""},
|
||||
{"format", "gguf"},
|
||||
|
|
@ -3576,7 +3565,7 @@ void server_routes::init_routes() {
|
|||
json body = json::parse(req.body);
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
return handle_completions_impl(
|
||||
req,
|
||||
|
|
@ -3592,7 +3581,7 @@ void server_routes::init_routes() {
|
|||
json body = convert_anthropic_to_oai(json::parse(req.body));
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
return handle_completions_impl(
|
||||
req,
|
||||
|
|
@ -3608,7 +3597,7 @@ void server_routes::init_routes() {
|
|||
json body = convert_anthropic_to_oai(json::parse(req.body));
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
|
||||
json prompt = body_parsed.at("prompt");
|
||||
|
|
@ -3624,7 +3613,7 @@ void server_routes::init_routes() {
|
|||
json body = json::parse(req.body);
|
||||
json data = oaicompat_chat_params_parse(
|
||||
body,
|
||||
ctx_server.oai_parser_opt,
|
||||
meta->chat_params,
|
||||
files);
|
||||
res->ok({{ "prompt", std::move(data.at("prompt")) }});
|
||||
return res;
|
||||
|
|
@ -3635,8 +3624,8 @@ void server_routes::init_routes() {
|
|||
|
||||
// this endpoint can be accessed during sleeping
|
||||
// the next LOC is to avoid someone accidentally use ctx_server
|
||||
bool server_ctx; // do NOT delete this line
|
||||
GGML_UNUSED(server_ctx);
|
||||
bool ctx_server; // do NOT delete this line
|
||||
GGML_UNUSED(ctx_server);
|
||||
|
||||
json models = {
|
||||
{"models", {
|
||||
|
|
|
|||
|
|
@ -20,9 +20,8 @@ struct server_context_meta {
|
|||
int slot_n_ctx;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
// chat template
|
||||
std::string chat_template;
|
||||
std::string chat_template_tool_use;
|
||||
// chat params
|
||||
server_chat_params & chat_params;
|
||||
|
||||
// tokens
|
||||
std::string bos_token_str;
|
||||
|
|
|
|||
|
|
@ -68,10 +68,10 @@ json task_params::to_json(bool only_metrics) const {
|
|||
{"stream", stream},
|
||||
{"n_probs", sampling.n_probs},
|
||||
{"min_keep", sampling.min_keep},
|
||||
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
|
||||
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
|
||||
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
|
||||
{"chat_format", common_chat_format_name(chat_parser_params.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
|
||||
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
|
||||
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
|
||||
{"samplers", samplers},
|
||||
{"speculative.n_max", speculative.n_max},
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
|
|
@ -127,10 +127,10 @@ json task_params::to_json(bool only_metrics) const {
|
|||
{"grammar_lazy", sampling.grammar_lazy},
|
||||
{"grammar_triggers", grammar_triggers},
|
||||
{"preserved_tokens", sampling.preserved_tokens},
|
||||
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
|
||||
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
|
||||
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
|
||||
{"chat_format", common_chat_format_name(chat_parser_params.format)},
|
||||
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
|
||||
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
|
||||
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
|
||||
{"samplers", samplers},
|
||||
{"speculative.n_max", speculative.n_max},
|
||||
{"speculative.n_min", speculative.n_min},
|
||||
|
|
@ -291,21 +291,21 @@ task_params server_task::params_from_json_cmpl(
|
|||
{
|
||||
auto it = data.find("chat_format");
|
||||
if (it != data.end()) {
|
||||
params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
|
||||
SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
|
||||
params.chat_parser_params.format = static_cast<common_chat_format>(it->get<int>());
|
||||
SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format));
|
||||
} else {
|
||||
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
|
||||
params.chat_parser_params.format = defaults.chat_parser_params.format;
|
||||
}
|
||||
common_reasoning_format reasoning_format = params_base.reasoning_format;
|
||||
if (data.contains("reasoning_format")) {
|
||||
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
|
||||
}
|
||||
params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
|
||||
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
|
||||
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
|
||||
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
|
||||
params.chat_parser_params.reasoning_format = reasoning_format;
|
||||
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
|
||||
params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
|
||||
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
|
||||
if (data.contains("chat_parser")) {
|
||||
params.oaicompat_chat_syntax.parser.load(data.at("chat_parser").get<std::string>());
|
||||
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -722,7 +722,7 @@ common_chat_msg task_result_state::update_chat_msg(
|
|||
auto new_msg = common_chat_parse(
|
||||
generated_text,
|
||||
is_partial,
|
||||
oaicompat_chat_syntax);
|
||||
chat_parser_params);
|
||||
if (!new_msg.empty()) {
|
||||
new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
|
||||
chat_msg = new_msg;
|
||||
|
|
|
|||
|
|
@ -78,7 +78,9 @@ struct task_params {
|
|||
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
||||
std::string oaicompat_model;
|
||||
std::string oaicompat_cmpl_id;
|
||||
common_chat_syntax oaicompat_chat_syntax;
|
||||
|
||||
// per-request parameters for chat parsing
|
||||
common_chat_parser_params chat_parser_params;
|
||||
|
||||
// Embeddings
|
||||
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
|
||||
|
|
@ -91,7 +93,7 @@ struct task_params {
|
|||
struct task_result_state {
|
||||
// tracking diffs for partial tool calls
|
||||
std::vector<common_chat_msg_diff> diffs;
|
||||
common_chat_syntax oaicompat_chat_syntax;
|
||||
common_chat_parser_params chat_parser_params;
|
||||
common_chat_msg chat_msg;
|
||||
std::string generated_text; // append new chunks of generated text here
|
||||
std::vector<std::string> generated_tool_call_ids;
|
||||
|
|
@ -100,8 +102,8 @@ struct task_result_state {
|
|||
bool anthropic_thinking_block_started = false;
|
||||
bool anthropic_text_block_started = false;
|
||||
|
||||
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
|
||||
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
|
||||
task_result_state(const common_chat_parser_params & chat_parser_params)
|
||||
: chat_parser_params(chat_parser_params) {}
|
||||
|
||||
// parse partial tool calls and update the internal state
|
||||
common_chat_msg update_chat_msg(
|
||||
|
|
@ -130,8 +132,10 @@ struct server_task {
|
|||
task_params params;
|
||||
server_tokens tokens;
|
||||
|
||||
// only used by CLI, this delegates the tokenization to the server
|
||||
json cli_input = nullptr;
|
||||
// only used by CLI, this allow tokenizing CLI inputs on server side
|
||||
// we need this because mtmd_context and vocab are not accessible outside of server_context
|
||||
bool cli = false;
|
||||
std::string cli_prompt;
|
||||
std::vector<raw_buffer> cli_files;
|
||||
|
||||
server_task_type type;
|
||||
|
|
@ -228,7 +232,7 @@ struct server_task {
|
|||
// the task will be moved into queue, then onto slots
|
||||
// however, the state must be kept by caller (e.g., HTTP thread)
|
||||
task_result_state create_state() const {
|
||||
return task_result_state(params.oaicompat_chat_syntax);
|
||||
return task_result_state(params.chat_parser_params);
|
||||
}
|
||||
|
||||
bool is_parent() const {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue