mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-28 03:30:20 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/openvino.Dockerfile # .github/workflows/build-self-hosted.yml # .github/workflows/build.yml # common/chat.cpp # docs/backend/OPENVINO.md # examples/speculative-simple/speculative-simple.cpp # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/CMakeLists.txt # ggml/src/ggml-hexagon/htp/htp-ctx.h # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/libggml-htp.inf # ggml/src/ggml-openvino/ggml-decoder.cpp # ggml/src/ggml-openvino/ggml-openvino-extra.cpp # ggml/src/ggml-openvino/ggml-openvino.cpp # ggml/src/ggml-openvino/ggml-quants.cpp # ggml/src/ggml-openvino/openvino/op/rope.cpp # ggml/src/ggml-openvino/openvino/op_table.cpp # ggml/src/ggml-openvino/openvino/op_table.h # ggml/src/ggml-openvino/openvino/translate_session.cpp # ggml/src/ggml-openvino/openvino/utils.cpp # ggml/src/ggml-openvino/openvino/utils.h # ggml/src/ggml-openvino/utils.cpp # ggml/src/ggml-openvino/utils.h # ggml/src/ggml-sycl/common.hpp # ggml/src/ggml-sycl/convert.cpp # ggml/src/ggml-sycl/convert.hpp # ggml/src/ggml-sycl/gemm.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/set_rows.cpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/sync_vendor.py # tests/CMakeLists.txt # tests/test-chat.cpp # tools/cli/cli.cpp # tools/mtmd/CMakeLists.txt # tools/server/CMakeLists.txt
This commit is contained in:
commit
0755f27372
42 changed files with 1531 additions and 3199 deletions
|
|
@ -150,7 +150,7 @@
|
|||
#define TN_TOK_BOI "v.boi"
|
||||
#define TN_TOK_EOI "v.eoi"
|
||||
|
||||
// hunyuanocr
|
||||
// hunyuanocr / hunyuanvl (shared GGUF tensor names)
|
||||
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
|
||||
#define TN_TOK_IMG_BEGIN "mm.image_begin"
|
||||
#define TN_TOK_IMG_END "mm.image_end"
|
||||
|
|
@ -242,6 +242,15 @@
|
|||
#define TN_STD_BIAS "v.std_bias"
|
||||
#define TN_STD_SCALE "v.std_scale"
|
||||
|
||||
// yasa2
|
||||
#define TN_YASA_PATCH_LN_W "v.patch_ln.weight"
|
||||
#define TN_YASA_PATCH_LN_B "v.patch_ln.bias"
|
||||
#define TN_YASA_BACKBONE_LN_W "v.backbone_ln.weight"
|
||||
#define TN_YASA_BACKBONE_LN_B "v.backbone_ln.bias"
|
||||
#define TN_YASA_POS_EMBD "v.vision_pos_embed"
|
||||
#define TN_YASA_STAGE_DOWN_LN "v.stage.%d.down.ln.%s"
|
||||
#define TN_YASA_STAGE_DOWN_CONV "v.stage.%d.down.conv.%s"
|
||||
#define TN_YASA_STAGE_BLK "v.stage.%d.blk.%d.%s.%s"
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
|
@ -290,9 +299,11 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_LFM2A,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
PROJECTOR_TYPE_YOUTUVL,
|
||||
PROJECTOR_TYPE_YASA2,
|
||||
PROJECTOR_TYPE_KIMIK25,
|
||||
PROJECTOR_TYPE_NEMOTRON_V2_VL,
|
||||
PROJECTOR_TYPE_HUNYUANOCR,
|
||||
PROJECTOR_TYPE_HUNYUANVL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
|
@ -335,9 +346,11 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
{ PROJECTOR_TYPE_YASA2, "yasa2"},
|
||||
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
|
||||
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
|
||||
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
|
||||
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
|
|
|||
|
|
@ -268,6 +268,27 @@ struct mobilenetv5_block {
|
|||
ggml_tensor * attn_norm_w = nullptr;
|
||||
};
|
||||
|
||||
struct yasa2_block {
|
||||
ggml_tensor * dw_w = nullptr;
|
||||
ggml_tensor * dw_b = nullptr;
|
||||
ggml_tensor * ln_w = nullptr;
|
||||
ggml_tensor * ln_b = nullptr;
|
||||
ggml_tensor * pw1_w = nullptr;
|
||||
ggml_tensor * pw1_b = nullptr;
|
||||
ggml_tensor * grn_w = nullptr;
|
||||
ggml_tensor * grn_b = nullptr;
|
||||
ggml_tensor * pw2_w = nullptr;
|
||||
ggml_tensor * pw2_b = nullptr;
|
||||
};
|
||||
|
||||
struct yasa2_stage {
|
||||
ggml_tensor * down_ln_w = nullptr;
|
||||
ggml_tensor * down_ln_b = nullptr;
|
||||
ggml_tensor * down_conv_w = nullptr;
|
||||
ggml_tensor * down_conv_b = nullptr;
|
||||
std::vector<yasa2_block> blocks;
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
|
@ -402,6 +423,15 @@ struct clip_model {
|
|||
ggml_tensor * msfa_ffn_expand_bn = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_bn = nullptr;
|
||||
|
||||
// yasa2
|
||||
ggml_tensor * yasa_patch_w = nullptr;
|
||||
ggml_tensor * yasa_patch_b = nullptr;
|
||||
ggml_tensor * yasa_patch_ln_w = nullptr;
|
||||
ggml_tensor * yasa_patch_ln_b = nullptr;
|
||||
ggml_tensor * yasa_backbone_ln_w = nullptr;
|
||||
ggml_tensor * yasa_backbone_ln_b = nullptr;
|
||||
ggml_tensor * yasa_vision_pos_embed = nullptr;
|
||||
std::vector<yasa2_stage> yasa_stages;
|
||||
|
||||
// pixtral, glm4v
|
||||
ggml_tensor * token_embd_img_break = nullptr;
|
||||
|
|
|
|||
|
|
@ -76,6 +76,7 @@
|
|||
#include "models/deepseekocr.cpp"
|
||||
#include "models/mobilenetv5.cpp"
|
||||
#include "models/youtuvl.cpp"
|
||||
#include "models/yasa2.cpp"
|
||||
|
||||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||
|
||||
|
|
@ -969,6 +970,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
|
||||
} break;
|
||||
|
|
@ -1004,6 +1006,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_yasa2>(ctx, img);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("missing cgraph builder");
|
||||
}
|
||||
|
|
@ -1474,6 +1480,16 @@ struct clip_model_loader {
|
|||
hparams.set_limit_image_tokens(1, 62500);
|
||||
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
hparams.ffn_op = FFN_GELU_ERF;
|
||||
log_ffn_op = "gelu_erf";
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC;
|
||||
|
||||
// reka model performs better when using resize_bicubic, which stretches
|
||||
// the image to fit fixed square size
|
||||
hparams.image_resize_pad = false;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
|
|
@ -1544,6 +1560,16 @@ struct clip_model_loader {
|
|||
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
|
||||
hparams.set_warmup_n_tokens(28*28);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
|
||||
hparams.image_resize_pad = false;
|
||||
hparams.ffn_op = FFN_GELU;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||
hparams.set_limit_image_tokens(256, 16384);
|
||||
hparams.set_warmup_n_tokens(32*32);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
// audio preprocessing params
|
||||
|
|
@ -1929,6 +1955,55 @@ struct clip_model_loader {
|
|||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
// reuse tensors already loaded by the common section
|
||||
// (TN_PATCH_EMBD and TN_PATCH_BIAS have the same tensor names)
|
||||
GGML_ASSERT(model.patch_embeddings_0 && "yasa2 requires v.patch_embd.weight");
|
||||
model.yasa_patch_w = model.patch_embeddings_0;
|
||||
model.yasa_patch_b = model.patch_bias;
|
||||
model.yasa_patch_ln_w = get_tensor(TN_YASA_PATCH_LN_W, false);
|
||||
model.yasa_patch_ln_b = get_tensor(TN_YASA_PATCH_LN_B, false);
|
||||
model.yasa_backbone_ln_w = get_tensor(TN_YASA_BACKBONE_LN_W, false);
|
||||
model.yasa_backbone_ln_b = get_tensor(TN_YASA_BACKBONE_LN_B, false);
|
||||
model.yasa_vision_pos_embed = get_tensor(TN_YASA_POS_EMBD, false);
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
|
||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
|
||||
|
||||
model.yasa_stages.clear();
|
||||
for (int s = 0; ; ++s) {
|
||||
yasa2_stage stage;
|
||||
stage.down_ln_w = get_tensor(string_format(TN_YASA_STAGE_DOWN_LN, s, "weight"), false);
|
||||
stage.down_ln_b = get_tensor(string_format(TN_YASA_STAGE_DOWN_LN, s, "bias"), false);
|
||||
stage.down_conv_w = get_tensor(string_format(TN_YASA_STAGE_DOWN_CONV, s, "weight"), false);
|
||||
stage.down_conv_b = get_tensor(string_format(TN_YASA_STAGE_DOWN_CONV, s, "bias"), false);
|
||||
|
||||
for (int bi = 0; ; ++bi) {
|
||||
yasa2_block blk;
|
||||
blk.dw_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "dw", "weight"), false);
|
||||
if (!blk.dw_w) {
|
||||
break;
|
||||
}
|
||||
blk.dw_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "dw", "bias"), false);
|
||||
blk.ln_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "ln", "weight"), false);
|
||||
blk.ln_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "ln", "bias"), false);
|
||||
blk.pw1_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw1", "weight"), false);
|
||||
blk.pw1_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw1", "bias"), false);
|
||||
blk.grn_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "grn", "weight"), false);
|
||||
blk.grn_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "grn", "bias"), false);
|
||||
blk.pw2_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw2", "weight"), false);
|
||||
blk.pw2_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw2", "bias"), false);
|
||||
stage.blocks.push_back(blk);
|
||||
}
|
||||
|
||||
if (!stage.down_conv_w && stage.blocks.empty()) {
|
||||
break;
|
||||
}
|
||||
model.yasa_stages.push_back(std::move(stage));
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
|
||||
|
|
@ -2249,6 +2324,7 @@ struct clip_model_loader {
|
|||
model.mm_eoi = get_tensor(TN_TOK_EOI);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
|
|
@ -3062,6 +3138,19 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) {
|
|||
img_end = "<|vision_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
img_beg = "<image>";
|
||||
img_end = "</image>";
|
||||
// Currently only supprots single-tile preprocessing: any input is downscaled
|
||||
// to one image_size x image_size tile (64 output tokens via 8x8 adaptive avg
|
||||
// pool).
|
||||
// However, the model itself supports llava-uhd multi-tile tiling for high-res
|
||||
// images. This will be implemented in a future PR (dispatch on has_pinpoints
|
||||
// - see LDP/COGVLM branch above) and emit image_grid_pinpoints in the conversion
|
||||
// script.
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
|
|
@ -3199,6 +3288,7 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) {
|
|||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
// note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
|
||||
img_beg = "<|hy_place▁holder▁no▁100|>";
|
||||
|
|
@ -3287,6 +3377,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
|||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
|
|
@ -3306,6 +3397,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
|||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
|
|
@ -3333,6 +3425,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
{
|
||||
// do nothing
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
n_patches = 64; // adaptive average pooling to 8x8 tokens
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
case PROJECTOR_TYPE_LDPV2:
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
|
|
@ -3493,6 +3589,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
n_patches = h * (h + 1) + 1;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
int merge = ctx->model.hparams.n_merge;
|
||||
int ow = (img->nx / patch_size) / merge;
|
||||
|
|
@ -3953,9 +4050,74 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
case PROJECTOR_TYPE_PHI4:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
// do nothing
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
// Compute the HunyuanVL 2D position embedding on CPU (with the
|
||||
// custom sf=(target+0.1)/n_grid bilinear sampling that the
|
||||
// reference implementation uses) and upload it to the graph
|
||||
// input declared in clip_graph_hunyuanocr::build().
|
||||
GGML_ASSERT(model.position_embeddings != nullptr);
|
||||
ggml_tensor * src_t = model.position_embeddings;
|
||||
const int64_t n_embd = src_t->ne[0];
|
||||
const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid
|
||||
const int n_grid = (int)std::lround(std::sqrt((double)n_pos));
|
||||
GGML_ASSERT((int64_t)n_grid * n_grid == n_pos);
|
||||
const int out_w = pos_w; // pw
|
||||
const int out_h = pos_h; // ph
|
||||
|
||||
// Pull weight to host.
|
||||
std::vector<float> src(n_embd * n_pos);
|
||||
ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t));
|
||||
|
||||
// Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w):
|
||||
// ne[0] = n_embd (fastest), ne[1] = out_h*out_w
|
||||
// dst[(y*out_w + x) * n_embd + c]
|
||||
std::vector<float> dst((size_t)n_embd * out_h * out_w);
|
||||
|
||||
const float sx = (float)(out_w + 0.1f) / (float)n_grid;
|
||||
const float sy = (float)(out_h + 0.1f) / (float)n_grid;
|
||||
|
||||
for (int y = 0; y < out_h; ++y) {
|
||||
// Match ggml_compute_forward_upscale_f32 pixel-center
|
||||
// convention (align_corners=False): src_y = (y+0.5)/sy - 0.5.
|
||||
const float fy = ((float)y + 0.5f) / sy - 0.5f;
|
||||
int y0 = (int)std::floor(fy);
|
||||
int y1 = y0 + 1;
|
||||
y0 = std::clamp(y0, 0, n_grid - 1);
|
||||
y1 = std::clamp(y1, 0, n_grid - 1);
|
||||
float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f);
|
||||
const float wy0 = 1.0f - wy1;
|
||||
for (int x = 0; x < out_w; ++x) {
|
||||
const float fx = ((float)x + 0.5f) / sx - 0.5f;
|
||||
int x0 = (int)std::floor(fx);
|
||||
int x1 = x0 + 1;
|
||||
x0 = std::clamp(x0, 0, n_grid - 1);
|
||||
x1 = std::clamp(x1, 0, n_grid - 1);
|
||||
float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f);
|
||||
const float wx0 = 1.0f - wx1;
|
||||
|
||||
const float w00 = wy0 * wx0;
|
||||
const float w01 = wy0 * wx1;
|
||||
const float w10 = wy1 * wx0;
|
||||
const float w11 = wy1 * wx1;
|
||||
|
||||
const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd];
|
||||
const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd];
|
||||
const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd];
|
||||
const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd];
|
||||
float * d = &dst[((size_t)y * out_w + x) * n_embd];
|
||||
for (int c = 0; c < n_embd; ++c) {
|
||||
d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
set_input_f32("hunyuanvl_pos_embd", dst);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
// set the 2D positions
|
||||
|
|
@ -4376,8 +4538,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
case PROJECTOR_TYPE_KIMIVL:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
return ctx->model.mm_4h_to_h_w->ne[1];
|
||||
|
|
|
|||
|
|
@ -5,7 +5,21 @@ ggml_cgraph * clip_graph_hunyuanocr::build() {
|
|||
const int pw = n_patches_x;
|
||||
const int ph = n_patches_y;
|
||||
|
||||
ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
|
||||
// Position embedding interpolation.
|
||||
// HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
|
||||
// ggml_interpolate cannot express. To avoid adding a new ggml op, the
|
||||
// resize is computed on CPU in clip_image_batch_encode and uploaded here
|
||||
// as a graph input (named "hunyuanvl_pos_embd").
|
||||
// HunyuanOCR uses the same square layout and the standard ratio-based
|
||||
// interpolation provided by resize_position_embeddings().
|
||||
ggml_tensor * pos_embd = nullptr;
|
||||
if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
|
||||
pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
|
||||
ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
|
||||
ggml_set_input(pos_embd);
|
||||
} else {
|
||||
pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
|
||||
}
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
|
||||
|
|
|
|||
|
|
@ -43,6 +43,14 @@ struct clip_graph_youtuvl : clip_graph {
|
|||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_yasa2 : clip_graph {
|
||||
clip_graph_yasa2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
ggml_tensor * layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps = 1e-6f);
|
||||
ggml_tensor * convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b);
|
||||
};
|
||||
|
||||
struct clip_graph_minicpmv : clip_graph {
|
||||
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
191
tools/mtmd/models/yasa2.cpp
Normal file
191
tools/mtmd/models/yasa2.cpp
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
// ABOUTME: Yasa2 vision encoder graph builder for ConvNeXt-based architecture.
|
||||
// ABOUTME: Implements patch embedding, ConvNeXt stages with GRN, and adaptive pooling.
|
||||
|
||||
#include "models.h"
|
||||
|
||||
static ggml_tensor * add_channel_bias(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * x_whcb,
|
||||
ggml_tensor * b_c) {
|
||||
if (!b_c) {
|
||||
return x_whcb;
|
||||
}
|
||||
ggml_tensor * b4 = ggml_reshape_4d(ctx0, b_c, 1, 1, b_c->ne[0], 1);
|
||||
return ggml_add(ctx0, x_whcb, b4);
|
||||
}
|
||||
|
||||
static ggml_tensor * mul_channel_weight(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * x_whcb,
|
||||
ggml_tensor * w_c) {
|
||||
if (!w_c) {
|
||||
return x_whcb;
|
||||
}
|
||||
ggml_tensor * w4 = ggml_reshape_4d(ctx0, w_c, 1, 1, w_c->ne[0], 1);
|
||||
return ggml_mul(ctx0, x_whcb, w4);
|
||||
}
|
||||
|
||||
ggml_tensor * clip_graph_yasa2::layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps) {
|
||||
// Match HF ConvNextLayerNorm(channels_first):
|
||||
// u = mean_c(x), s = mean_c((x-u)^2), x = (x-u)/sqrt(s+eps)
|
||||
// cast back to input dtype before affine.
|
||||
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [W,H,C,B] -> [C,H,W,B]
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
|
||||
ggml_tensor * u = ggml_mean(ctx0, cur); // [1,H,W,B]
|
||||
ggml_tensor * xm = ggml_sub(ctx0, cur, u); // [C,H,W,B]
|
||||
|
||||
ggml_tensor * s = ggml_mul(ctx0, xm, xm); // [C,H,W,B]
|
||||
s = ggml_mean(ctx0, s); // [1,H,W,B]
|
||||
s = ggml_clamp(ctx0, s, eps, 1e30f); // avoid div-by-zero in no-alloc warmup
|
||||
s = ggml_sqrt(ctx0, s); // [1,H,W,B]
|
||||
|
||||
ggml_tensor * xhat = ggml_div(ctx0, xm, s); // [C,H,W,B]
|
||||
xhat = ggml_permute(ctx0, xhat, 2, 1, 0, 3); // [W,H,C,B]
|
||||
xhat = ggml_cont(ctx0, xhat);
|
||||
xhat = mul_channel_weight(ctx0, xhat, w);
|
||||
xhat = add_channel_bias(ctx0, xhat, b);
|
||||
return xhat;
|
||||
}
|
||||
|
||||
ggml_tensor * clip_graph_yasa2::convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b) {
|
||||
// Exact ConvNeXtV2 GRN:
|
||||
// Gx = ||x||_2 over spatial dims (W,H), Nx = Gx / (mean_c(Gx) + eps)
|
||||
// y = w * (x * Nx) + b + x
|
||||
const int64_t wdim = inp->ne[0];
|
||||
const int64_t hdim = inp->ne[1];
|
||||
const int64_t cdim = inp->ne[2];
|
||||
const int64_t bdim = inp->ne[3];
|
||||
|
||||
// Keep GRN math in fp32 for stability; fp16/bf16 accumulation can drift.
|
||||
ggml_tensor * sq = ggml_mul(ctx0, inp, inp);
|
||||
ggml_tensor * sq_flat = ggml_reshape_4d(ctx0, sq, wdim * hdim, cdim, 1, bdim); // [WH,C,1,B]
|
||||
ggml_tensor * gx = ggml_sum_rows(ctx0, sq_flat); // [1,C,1,B]
|
||||
gx = ggml_sqrt(ctx0, gx); // [1,C,1,B]
|
||||
|
||||
ggml_tensor * gx_ch_first = ggml_permute(ctx0, gx, 1, 0, 2, 3); // [C,1,1,B]
|
||||
gx_ch_first = ggml_cont(ctx0, gx_ch_first);
|
||||
ggml_tensor * gx_mean = ggml_mean(ctx0, gx_ch_first); // [1,1,1,B]
|
||||
|
||||
gx_mean = ggml_clamp(ctx0, gx_mean, 1e-6f, 1e30f); // approx +eps, warmup-safe
|
||||
ggml_tensor * nx = ggml_div(ctx0, gx, gx_mean); // [1,C,1,B]
|
||||
nx = ggml_permute(ctx0, nx, 0, 2, 1, 3); // [1,1,C,B]
|
||||
nx = ggml_cont(ctx0, nx);
|
||||
|
||||
ggml_tensor * xnx = ggml_mul(ctx0, inp, nx);
|
||||
xnx = mul_channel_weight(ctx0, xnx, w);
|
||||
xnx = add_channel_bias(ctx0, xnx, b);
|
||||
return ggml_add(ctx0, inp, xnx);
|
||||
}
|
||||
|
||||
ggml_cgraph * clip_graph_yasa2::build() {
|
||||
ggml_tensor * cur = build_inp_raw();
|
||||
|
||||
// Patch embedding Conv2d(kernel=4, stride=4)
|
||||
cur = ggml_conv_2d(ctx0, model.yasa_patch_w, cur, patch_size, patch_size, 0, 0, 1, 1);
|
||||
cur = add_channel_bias(ctx0, cur, model.yasa_patch_b);
|
||||
ggml_set_name(cur, "yasa2_patch_conv_out");
|
||||
cb(cur, "yasa2_patch_conv_out", -1);
|
||||
cur = layer_norm_channels(cur, model.yasa_patch_ln_w, model.yasa_patch_ln_b, eps);
|
||||
ggml_set_name(cur, "yasa2_patch_ln_out");
|
||||
cb(cur, "yasa2_patch_ln_out", -1);
|
||||
|
||||
// ConvNeXt stages
|
||||
for (size_t s = 0; s < model.yasa_stages.size(); ++s) {
|
||||
const auto & stage = model.yasa_stages[s];
|
||||
|
||||
if (stage.down_conv_w) {
|
||||
cur = layer_norm_channels(cur, stage.down_ln_w, stage.down_ln_b, eps);
|
||||
cur = ggml_conv_2d(ctx0, stage.down_conv_w, cur, 2, 2, 0, 0, 1, 1);
|
||||
cur = add_channel_bias(ctx0, cur, stage.down_conv_b);
|
||||
ggml_format_name(cur, "yasa2_stage%zu_down_out", s);
|
||||
}
|
||||
|
||||
for (size_t bi = 0; bi < stage.blocks.size(); ++bi) {
|
||||
const auto & blk = stage.blocks[bi];
|
||||
ggml_tensor * res = cur;
|
||||
|
||||
ggml_tensor * x = ggml_conv_2d_dw(ctx0, blk.dw_w, cur, 1, 1, 3, 3, 1, 1);
|
||||
x = add_channel_bias(ctx0, x, blk.dw_b);
|
||||
x = layer_norm_channels(x, blk.ln_w, blk.ln_b, eps);
|
||||
|
||||
// pwconv1/pwconv2 are HF Linear layers over channels; implement via matmul on tokens.
|
||||
const int64_t w = x->ne[0];
|
||||
const int64_t h = x->ne[1];
|
||||
const int64_t b = x->ne[3];
|
||||
|
||||
ggml_tensor * tok = ggml_reshape_3d(ctx0, x, w * h, x->ne[2], b); // [T,C,B]
|
||||
tok = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [C,T,B]
|
||||
tok = ggml_cont(ctx0, tok);
|
||||
|
||||
tok = ggml_mul_mat(ctx0, blk.pw1_w, tok); // [4C,T,B]
|
||||
if (blk.pw1_b) {
|
||||
ggml_tensor * b1 = ggml_reshape_3d(ctx0, blk.pw1_b, blk.pw1_b->ne[0], 1, 1); // [4C,1,1]
|
||||
tok = ggml_add(ctx0, tok, b1);
|
||||
}
|
||||
x = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [T,4C,B]
|
||||
x = ggml_cont(ctx0, x);
|
||||
x = ggml_reshape_4d(ctx0, x, w, h, tok->ne[0], b); // [W,H,4C,B]
|
||||
x = ggml_gelu_erf(ctx0, x);
|
||||
x = convnext_grn(x, blk.grn_w, blk.grn_b);
|
||||
|
||||
tok = ggml_reshape_3d(ctx0, x, w * h, x->ne[2], b); // [T,4C,B]
|
||||
tok = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [4C,T,B]
|
||||
tok = ggml_cont(ctx0, tok);
|
||||
|
||||
tok = ggml_mul_mat(ctx0, blk.pw2_w, tok); // [C,T,B]
|
||||
if (blk.pw2_b) {
|
||||
ggml_tensor * b2 = ggml_reshape_3d(ctx0, blk.pw2_b, blk.pw2_b->ne[0], 1, 1); // [C,1,1]
|
||||
tok = ggml_add(ctx0, tok, b2);
|
||||
}
|
||||
x = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [T,C,B]
|
||||
x = ggml_cont(ctx0, x);
|
||||
x = ggml_reshape_4d(ctx0, x, w, h, tok->ne[0], b); // [W,H,C,B]
|
||||
|
||||
cur = ggml_add(ctx0, res, x);
|
||||
ggml_format_name(cur, "yasa2_stage%zu_blk%zu_out", s, bi);
|
||||
}
|
||||
}
|
||||
|
||||
// HF path adds vision position embeddings BEFORE adaptive pooling.
|
||||
const int64_t pre_w = cur->ne[0];
|
||||
const int64_t pre_h = cur->ne[1];
|
||||
ggml_tensor * tokens_pre = ggml_reshape_3d(ctx0, cur, pre_w * pre_h, cur->ne[2], cur->ne[3]); // [T,C,B]
|
||||
tokens_pre = ggml_permute(ctx0, tokens_pre, 1, 0, 2, 3); // [C,T,B]
|
||||
tokens_pre = ggml_cont(ctx0, tokens_pre);
|
||||
if (model.yasa_vision_pos_embed && tokens_pre->ne[1] == model.yasa_vision_pos_embed->ne[1]) {
|
||||
const int64_t n_ch = model.yasa_vision_pos_embed->ne[0];
|
||||
const int64_t n_tokens = model.yasa_vision_pos_embed->ne[1];
|
||||
ggml_tensor * pos = ggml_reshape_3d(ctx0, model.yasa_vision_pos_embed, (int) n_ch, (int) n_tokens, 1);
|
||||
tokens_pre = ggml_add(ctx0, tokens_pre, pos);
|
||||
}
|
||||
cur = ggml_permute(ctx0, tokens_pre, 1, 0, 2, 3); // [T,C,B]
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_reshape_4d(ctx0, cur, pre_w, pre_h, cur->ne[1], cur->ne[2]); // [W,H,C,B]
|
||||
|
||||
// AdaptiveAvgPool2d target is 8x8 for real inputs, but warmup can use tiny images.
|
||||
const int pooled_w = std::min(8, (int) cur->ne[0]);
|
||||
const int pooled_h = std::min(8, (int) cur->ne[1]);
|
||||
const int kw = std::max(1, (int) cur->ne[0] / pooled_w);
|
||||
const int kh = std::max(1, (int) cur->ne[1] / pooled_h);
|
||||
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kw, kh, kw, kh, 0, 0);
|
||||
|
||||
// [W,H,C,B] -> [C,T,B]
|
||||
ggml_tensor * tokens = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2], cur->ne[3]);
|
||||
tokens = ggml_permute(ctx0, tokens, 1, 0, 2, 3);
|
||||
tokens = ggml_cont(ctx0, tokens);
|
||||
cb(tokens, "yasa2_tokens", -1);
|
||||
|
||||
GGML_ASSERT(model.mm_0_w && model.mm_2_w);
|
||||
ggml_tensor * embeddings = build_ffn(
|
||||
tokens,
|
||||
model.mm_0_w, model.mm_0_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_2_w, model.mm_2_b,
|
||||
FFN_GELU_ERF,
|
||||
-1);
|
||||
cb(embeddings, "yasa2_emb", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
return gf;
|
||||
}
|
||||
|
|
@ -35,15 +35,23 @@ struct mtmd_bitmap {
|
|||
|
||||
// position indexing for decoder model
|
||||
enum mtmd_pos_type {
|
||||
MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens
|
||||
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
|
||||
MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens
|
||||
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
|
||||
MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3
|
||||
};
|
||||
|
||||
struct mtmd_image_tokens {
|
||||
uint32_t nx; // number of tokens in x direction
|
||||
uint32_t ny; // number of tokens in y direction
|
||||
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
|
||||
uint32_t n_tokens() const { return nx * ny; }
|
||||
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL)
|
||||
uint32_t n_tokens() const {
|
||||
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
|
||||
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
|
||||
return (nx + 1) * ny + 2;
|
||||
}
|
||||
return nx * ny;
|
||||
}
|
||||
clip_image_f32_batch batch_f32; // preprocessed image patches
|
||||
std::string id; // optional user-defined ID, useful for KV cache tracking
|
||||
|
||||
|
|
@ -52,6 +60,7 @@ struct mtmd_image_tokens {
|
|||
nx,
|
||||
ny,
|
||||
pos,
|
||||
image_idx,
|
||||
batch_f32.clone(),
|
||||
id
|
||||
};
|
||||
|
|
@ -186,6 +195,7 @@ struct mtmd_context {
|
|||
|
||||
auto decoder_rope_type = llama_model_rope_type(text_model);
|
||||
switch (decoder_rope_type) {
|
||||
case LLAMA_ROPE_TYPE_NONE:
|
||||
case LLAMA_ROPE_TYPE_NORM:
|
||||
case LLAMA_ROPE_TYPE_NEOX:
|
||||
{
|
||||
|
|
@ -316,6 +326,19 @@ struct mtmd_context {
|
|||
img_end = "<|vision_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YASA2:
|
||||
{
|
||||
img_beg = "<image>";
|
||||
img_end = "</image>";
|
||||
// Currently only supprots single-tile preprocessing: any input is downscaled
|
||||
// to one image_size x image_size tile (64 output tokens via 8x8 adaptive avg
|
||||
// pool).
|
||||
// However, the model itself supports llava-uhd multi-tile tiling for high-res
|
||||
// images. This will be implemented in a future PR (dispatch on has_pinpoints
|
||||
// - see LDP/COGVLM branch above) and emit image_grid_pinpoints in the conversion
|
||||
// script.
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
|
|
@ -453,6 +476,7 @@ struct mtmd_context {
|
|||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
// note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
|
||||
img_beg = "<|hy_place▁holder▁no▁100|>";
|
||||
|
|
@ -598,6 +622,7 @@ struct mtmd_tokenizer {
|
|||
const llama_vocab * vocab;
|
||||
|
||||
mtmd_input_chunks cur;
|
||||
uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
|
||||
|
||||
mtmd_tokenizer(mtmd_context * ctx,
|
||||
const mtmd_input_text * text,
|
||||
|
|
@ -806,6 +831,14 @@ struct mtmd_tokenizer {
|
|||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->pos = ctx->pos_type;
|
||||
// HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row,
|
||||
// and uses XD-RoPE (dim-3 = image index). Override the position type so that
|
||||
// n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout.
|
||||
if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) {
|
||||
image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL;
|
||||
image_tokens->image_idx = n_images_added;
|
||||
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
|
||||
}
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmap->id; // optional
|
||||
|
||||
|
|
@ -826,6 +859,9 @@ struct mtmd_tokenizer {
|
|||
add_text(ctx->img_end, true); // add image end token
|
||||
}
|
||||
|
||||
// advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot
|
||||
n_images_added++;
|
||||
|
||||
} else {
|
||||
// handle audio
|
||||
|
||||
|
|
@ -1273,6 +1309,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima
|
|||
pos.y = pos_0 + i;
|
||||
pos.z = pos_0 + i;
|
||||
} break;
|
||||
case MTMD_POS_TYPE_HUNYUANVL:
|
||||
{
|
||||
// HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
|
||||
// Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim;
|
||||
// content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx.
|
||||
const uint32_t nx = image_tokens->nx;
|
||||
const uint32_t n_total = image_tokens->n_tokens();
|
||||
if (i == 0) {
|
||||
// BOI
|
||||
pos.t = pos_0 + i;
|
||||
pos.x = pos_0 + i;
|
||||
pos.y = pos_0 + i;
|
||||
pos.z = pos_0 + i;
|
||||
} else if (i == n_total - 1) {
|
||||
// EOI
|
||||
pos.t = pos_0 + i;
|
||||
pos.x = pos_0 + i;
|
||||
pos.y = pos_0 + i;
|
||||
pos.z = pos_0 + i;
|
||||
} else {
|
||||
// content token at (row, col), or the trailing newline of a row (col == nx)
|
||||
// section 0 = sequential, section 1 = w(col), section 2 = h(row), section 3 = image_count.
|
||||
// set_position_mrope_2d writes .y -> section 1 and .x -> section 2
|
||||
const uint32_t offset = (uint32_t)i - 1;
|
||||
const uint32_t row = offset / (nx + 1);
|
||||
const uint32_t col = offset % (nx + 1);
|
||||
pos.t = pos_0 + i;
|
||||
pos.x = row;
|
||||
pos.y = col;
|
||||
pos.z = image_tokens->image_idx;
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("invalid position type");
|
||||
}
|
||||
|
|
@ -1289,6 +1357,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
|
|||
return std::max(image_tokens->nx, image_tokens->ny);
|
||||
case MTMD_POS_TYPE_NORMAL:
|
||||
return image_tokens->n_tokens();
|
||||
case MTMD_POS_TYPE_HUNYUANVL:
|
||||
// HunyuanVL: the sequential (dim-0) position advances by the full token count
|
||||
// (includes BOI/EOI and row newline tokens), not by max(nx, ny)
|
||||
return image_tokens->n_tokens();
|
||||
default:
|
||||
GGML_ABORT("invalid position type");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
|
|||
add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
|
||||
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
|
||||
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
|
||||
add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
||||
|
|
|
|||
588
tools/server/server-chat.cpp
Normal file
588
tools/server/server-chat.cpp
Normal file
|
|
@ -0,0 +1,588 @@
|
|||
#include "server-chat.h"
|
||||
#include "server-common.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
json server_chat_convert_responses_to_chatcmpl(const json & response_body) {
|
||||
if (!response_body.contains("input")) {
|
||||
throw std::invalid_argument("'input' is required");
|
||||
}
|
||||
if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
|
||||
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
|
||||
}
|
||||
|
||||
const json input_value = response_body.at("input");
|
||||
json chatcmpl_body = response_body;
|
||||
chatcmpl_body.erase("input");
|
||||
std::vector<json> chatcmpl_messages;
|
||||
|
||||
if (response_body.contains("instructions")) {
|
||||
chatcmpl_messages.push_back({
|
||||
{"role", "system"},
|
||||
{"content", json_value(response_body, "instructions", std::string())},
|
||||
});
|
||||
chatcmpl_body.erase("instructions");
|
||||
}
|
||||
|
||||
if (input_value.is_string()) {
|
||||
// #responses_create-input-text_input
|
||||
chatcmpl_messages.push_back({
|
||||
{"role", "user"},
|
||||
{"content", input_value},
|
||||
});
|
||||
} else if (input_value.is_array()) {
|
||||
// #responses_create-input-input_item_list
|
||||
|
||||
static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
|
||||
return j.contains(key) && j.at(key).is_array();
|
||||
};
|
||||
static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
|
||||
return j.contains(key) && j.at(key).is_string();
|
||||
};
|
||||
|
||||
for (json item : input_value) {
|
||||
bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant";
|
||||
|
||||
if (exists_and_is_string(item, "content")) {
|
||||
// #responses_create-input-input_item_list-input_message-content-text_input
|
||||
// Only "Input message" contains item["content"]::string
|
||||
// After converting item["content"]::string to item["content"]::array,
|
||||
// we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
|
||||
item["content"] = json::array({
|
||||
json {
|
||||
{"text", item.at("content")},
|
||||
{"type", "input_text"}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (exists_and_is_array(item, "content") &&
|
||||
exists_and_is_string(item, "role") &&
|
||||
(item.at("role") == "user" ||
|
||||
item.at("role") == "system" ||
|
||||
item.at("role") == "developer")
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-input_message
|
||||
std::vector<json> chatcmpl_content;
|
||||
|
||||
for (const json & input_item : item.at("content")) {
|
||||
const std::string type = json_value(input_item, "type", std::string());
|
||||
|
||||
if (type == "input_text") {
|
||||
if (!input_item.contains("text")) {
|
||||
throw std::invalid_argument("'Input text' requires 'text'");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"text", input_item.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
} else if (type == "input_image") {
|
||||
// While `detail` is marked as required,
|
||||
// it has default value("auto") and can be omitted.
|
||||
|
||||
if (!input_item.contains("image_url")) {
|
||||
throw std::invalid_argument("'image_url' is required");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"image_url", json {
|
||||
{"url", input_item.at("image_url")}
|
||||
}},
|
||||
{"type", "image_url"},
|
||||
});
|
||||
} else if (type == "input_file") {
|
||||
throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
|
||||
}
|
||||
}
|
||||
|
||||
if (item.contains("type")) {
|
||||
item.erase("type");
|
||||
}
|
||||
if (item.contains("status")) {
|
||||
item.erase("status");
|
||||
}
|
||||
item["content"] = chatcmpl_content;
|
||||
|
||||
chatcmpl_messages.push_back(item);
|
||||
} else if (exists_and_is_string(item, "role") &&
|
||||
item.at("role") == "assistant" &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "message"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-output_message
|
||||
auto chatcmpl_content = json::array();
|
||||
|
||||
// Handle both string content and array content
|
||||
if (item.contains("content") && item.at("content").is_string()) {
|
||||
// String content - convert to text content part
|
||||
chatcmpl_content.push_back({
|
||||
{"text", item.at("content")},
|
||||
{"type", "text"},
|
||||
});
|
||||
} else if (exists_and_is_array(item, "content")) {
|
||||
// Array content - process each item
|
||||
for (const auto & output_text : item.at("content")) {
|
||||
const std::string type = json_value(output_text, "type", std::string());
|
||||
if (type == "output_text" || type == "input_text") {
|
||||
// Accept both output_text and input_text (string content gets converted to input_text)
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
} else if (type == "refusal") {
|
||||
if (!exists_and_is_string(output_text, "refusal")) {
|
||||
throw std::invalid_argument("'Refusal' requires 'refusal'");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"refusal", output_text.at("refusal")},
|
||||
{"type", "refusal"},
|
||||
});
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (merge_prev) {
|
||||
auto & prev_msg = chatcmpl_messages.back();
|
||||
if (!exists_and_is_array(prev_msg, "content")) {
|
||||
prev_msg["content"] = json::array();
|
||||
}
|
||||
auto & prev_content = prev_msg["content"];
|
||||
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
|
||||
} else {
|
||||
item.erase("status");
|
||||
item.erase("type");
|
||||
item["content"] = chatcmpl_content;
|
||||
chatcmpl_messages.push_back(item);
|
||||
}
|
||||
} else if (exists_and_is_string(item, "arguments") &&
|
||||
exists_and_is_string(item, "call_id") &&
|
||||
exists_and_is_string(item, "name") &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "function_call"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-function_tool_call
|
||||
json tool_call = {
|
||||
{"function", json {
|
||||
{"arguments", item.at("arguments")},
|
||||
{"name", item.at("name")},
|
||||
}},
|
||||
{"id", item.at("call_id")},
|
||||
{"type", "function"},
|
||||
};
|
||||
|
||||
if (merge_prev) {
|
||||
auto & prev_msg = chatcmpl_messages.back();
|
||||
if (!exists_and_is_array(prev_msg, "tool_calls")) {
|
||||
prev_msg["tool_calls"] = json::array();
|
||||
}
|
||||
prev_msg["tool_calls"].push_back(tool_call);
|
||||
} else {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"role", "assistant"},
|
||||
{"tool_calls", json::array({tool_call})}
|
||||
});
|
||||
}
|
||||
} else if (exists_and_is_string(item, "call_id") &&
|
||||
(exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "function_call_output"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-function_tool_call_output
|
||||
if (item.at("output").is_string()) {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"content", item.at("output")},
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", item.at("call_id")},
|
||||
});
|
||||
} else {
|
||||
json chatcmpl_outputs = item.at("output");
|
||||
for (json & chatcmpl_output : chatcmpl_outputs) {
|
||||
if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
|
||||
throw std::invalid_argument("Output of tool call should be 'Input text'");
|
||||
}
|
||||
chatcmpl_output["type"] = "text";
|
||||
}
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"content", chatcmpl_outputs},
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", item.at("call_id")},
|
||||
});
|
||||
}
|
||||
} else if (exists_and_is_array(item, "summary") &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "reasoning") {
|
||||
// #responses_create-input-input_item_list-item-reasoning
|
||||
|
||||
if (!exists_and_is_array(item, "content")) {
|
||||
throw std::invalid_argument("item['content'] is not an array");
|
||||
}
|
||||
if (item.at("content").empty()) {
|
||||
throw std::invalid_argument("item['content'] is empty");
|
||||
}
|
||||
if (!exists_and_is_string(item.at("content")[0], "text")) {
|
||||
throw std::invalid_argument("item['content']['text'] is not a string");
|
||||
}
|
||||
|
||||
if (merge_prev) {
|
||||
auto & prev_msg = chatcmpl_messages.back();
|
||||
prev_msg["reasoning_content"] = item.at("content")[0].at("text");
|
||||
} else {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"role", "assistant"},
|
||||
{"content", json::array()},
|
||||
{"reasoning_content", item.at("content")[0].at("text")},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("Cannot determine type of 'item'");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("'input' must be a string or array of objects");
|
||||
}
|
||||
|
||||
chatcmpl_body["messages"] = chatcmpl_messages;
|
||||
|
||||
if (response_body.contains("tools")) {
|
||||
if (!response_body.at("tools").is_array()) {
|
||||
throw std::invalid_argument("'tools' must be an array of objects");
|
||||
}
|
||||
std::vector<json> chatcmpl_tools;
|
||||
for (json resp_tool : response_body.at("tools")) {
|
||||
json chatcmpl_tool;
|
||||
|
||||
if (json_value(resp_tool, "type", std::string()) != "function") {
|
||||
throw std::invalid_argument("'type' of tool must be 'function'");
|
||||
}
|
||||
resp_tool.erase("type");
|
||||
chatcmpl_tool["type"] = "function";
|
||||
|
||||
if (!resp_tool.contains("strict")) {
|
||||
resp_tool["strict"] = true;
|
||||
}
|
||||
chatcmpl_tool["function"] = resp_tool;
|
||||
chatcmpl_tools.push_back(chatcmpl_tool);
|
||||
}
|
||||
chatcmpl_body.erase("tools");
|
||||
chatcmpl_body["tools"] = chatcmpl_tools;
|
||||
}
|
||||
|
||||
if (response_body.contains("max_output_tokens")) {
|
||||
chatcmpl_body.erase("max_output_tokens");
|
||||
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
|
||||
}
|
||||
|
||||
return chatcmpl_body;
|
||||
}
|
||||
|
||||
json server_chat_convert_anthropic_to_oai(const json & body) {
|
||||
json oai_body;
|
||||
|
||||
// Convert system prompt
|
||||
json oai_messages = json::array();
|
||||
auto system_param = json_value(body, "system", json());
|
||||
if (!system_param.is_null()) {
|
||||
std::string system_content;
|
||||
|
||||
if (system_param.is_string()) {
|
||||
system_content = system_param.get<std::string>();
|
||||
} else if (system_param.is_array()) {
|
||||
for (const auto & block : system_param) {
|
||||
if (json_value(block, "type", std::string()) == "text") {
|
||||
system_content += json_value(block, "text", std::string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
oai_messages.push_back({
|
||||
{"role", "system"},
|
||||
{"content", system_content}
|
||||
});
|
||||
}
|
||||
|
||||
// Convert messages
|
||||
if (!body.contains("messages")) {
|
||||
throw std::runtime_error("'messages' is required");
|
||||
}
|
||||
const json & messages = body.at("messages");
|
||||
if (messages.is_array()) {
|
||||
for (const auto & msg : messages) {
|
||||
std::string role = json_value(msg, "role", std::string());
|
||||
|
||||
if (!msg.contains("content")) {
|
||||
if (role == "assistant") {
|
||||
continue;
|
||||
}
|
||||
oai_messages.push_back(msg);
|
||||
continue;
|
||||
}
|
||||
|
||||
const json & content = msg.at("content");
|
||||
|
||||
if (content.is_string()) {
|
||||
oai_messages.push_back(msg);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!content.is_array()) {
|
||||
oai_messages.push_back(msg);
|
||||
continue;
|
||||
}
|
||||
|
||||
json tool_calls = json::array();
|
||||
json converted_content = json::array();
|
||||
json tool_results = json::array();
|
||||
std::string reasoning_content;
|
||||
bool has_tool_calls = false;
|
||||
|
||||
for (const auto & block : content) {
|
||||
std::string type = json_value(block, "type", std::string());
|
||||
|
||||
if (type == "text") {
|
||||
converted_content.push_back(block);
|
||||
} else if (type == "thinking") {
|
||||
reasoning_content += json_value(block, "thinking", std::string());
|
||||
} else if (type == "image") {
|
||||
json source = json_value(block, "source", json::object());
|
||||
std::string source_type = json_value(source, "type", std::string());
|
||||
|
||||
if (source_type == "base64") {
|
||||
std::string media_type = json_value(source, "media_type", std::string("image/jpeg"));
|
||||
std::string data = json_value(source, "data", std::string());
|
||||
std::ostringstream ss;
|
||||
ss << "data:" << media_type << ";base64," << data;
|
||||
|
||||
converted_content.push_back({
|
||||
{"type", "image_url"},
|
||||
{"image_url", {
|
||||
{"url", ss.str()}
|
||||
}}
|
||||
});
|
||||
} else if (source_type == "url") {
|
||||
std::string url = json_value(source, "url", std::string());
|
||||
converted_content.push_back({
|
||||
{"type", "image_url"},
|
||||
{"image_url", {
|
||||
{"url", url}
|
||||
}}
|
||||
});
|
||||
}
|
||||
} else if (type == "tool_use") {
|
||||
tool_calls.push_back({
|
||||
{"id", json_value(block, "id", std::string())},
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", json_value(block, "name", std::string())},
|
||||
{"arguments", json_value(block, "input", json::object()).dump()}
|
||||
}}
|
||||
});
|
||||
has_tool_calls = true;
|
||||
} else if (type == "tool_result") {
|
||||
std::string tool_use_id = json_value(block, "tool_use_id", std::string());
|
||||
|
||||
auto result_content = json_value(block, "content", json());
|
||||
std::string result_text;
|
||||
if (result_content.is_string()) {
|
||||
result_text = result_content.get<std::string>();
|
||||
} else if (result_content.is_array()) {
|
||||
for (const auto & c : result_content) {
|
||||
if (json_value(c, "type", std::string()) == "text") {
|
||||
result_text += json_value(c, "text", std::string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tool_results.push_back({
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", tool_use_id},
|
||||
{"content", result_text}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (!converted_content.empty() || has_tool_calls || !reasoning_content.empty()) {
|
||||
json new_msg = {{"role", role}};
|
||||
if (!converted_content.empty()) {
|
||||
new_msg["content"] = converted_content;
|
||||
} else if (has_tool_calls || !reasoning_content.empty()) {
|
||||
new_msg["content"] = "";
|
||||
}
|
||||
if (!tool_calls.empty()) {
|
||||
new_msg["tool_calls"] = tool_calls;
|
||||
}
|
||||
if (!reasoning_content.empty()) {
|
||||
new_msg["reasoning_content"] = reasoning_content;
|
||||
}
|
||||
oai_messages.push_back(new_msg);
|
||||
}
|
||||
|
||||
for (const auto & tool_msg : tool_results) {
|
||||
oai_messages.push_back(tool_msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
oai_body["messages"] = oai_messages;
|
||||
|
||||
// Convert tools
|
||||
if (body.contains("tools")) {
|
||||
const json & tools = body.at("tools");
|
||||
if (tools.is_array()) {
|
||||
json oai_tools = json::array();
|
||||
for (const auto & tool : tools) {
|
||||
oai_tools.push_back({
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", json_value(tool, "name", std::string())},
|
||||
{"description", json_value(tool, "description", std::string())},
|
||||
{"parameters", tool.contains("input_schema") ? tool.at("input_schema") : json::object()}
|
||||
}}
|
||||
});
|
||||
}
|
||||
oai_body["tools"] = oai_tools;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert tool_choice
|
||||
if (body.contains("tool_choice")) {
|
||||
const json & tc = body.at("tool_choice");
|
||||
if (tc.is_object()) {
|
||||
std::string type = json_value(tc, "type", std::string());
|
||||
if (type == "auto") {
|
||||
oai_body["tool_choice"] = "auto";
|
||||
} else if (type == "any" || type == "tool") {
|
||||
oai_body["tool_choice"] = "required";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert stop_sequences to stop
|
||||
if (body.contains("stop_sequences")) {
|
||||
oai_body["stop"] = body.at("stop_sequences");
|
||||
}
|
||||
|
||||
// Handle max_tokens (required in Anthropic, but we're permissive)
|
||||
if (body.contains("max_tokens")) {
|
||||
oai_body["max_tokens"] = body.at("max_tokens");
|
||||
} else {
|
||||
oai_body["max_tokens"] = 4096;
|
||||
}
|
||||
|
||||
// Pass through common params
|
||||
for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
|
||||
if (body.contains(key)) {
|
||||
oai_body[key] = body.at(key);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle Anthropic-specific thinking param
|
||||
if (body.contains("thinking")) {
|
||||
json thinking = json_value(body, "thinking", json::object());
|
||||
std::string thinking_type = json_value(thinking, "type", std::string());
|
||||
if (thinking_type == "enabled") {
|
||||
int budget_tokens = json_value(thinking, "budget_tokens", 10000);
|
||||
oai_body["thinking_budget_tokens"] = budget_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle Anthropic-specific metadata param
|
||||
if (body.contains("metadata")) {
|
||||
json metadata = json_value(body, "metadata", json::object());
|
||||
std::string user_id = json_value(metadata, "user_id", std::string());
|
||||
if (!user_id.empty()) {
|
||||
oai_body["__metadata_user_id"] = user_id;
|
||||
}
|
||||
}
|
||||
|
||||
return oai_body;
|
||||
}
|
||||
|
||||
json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
|
||||
json delta = json::object();
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
delta["reasoning_content"] = diff.reasoning_content_delta;
|
||||
}
|
||||
if (!diff.content_delta.empty()) {
|
||||
delta["content"] = diff.content_delta;
|
||||
}
|
||||
if (diff.tool_call_index != std::string::npos) {
|
||||
json tool_call;
|
||||
tool_call["index"] = diff.tool_call_index;
|
||||
if (!diff.tool_call_delta.id.empty()) {
|
||||
tool_call["id"] = diff.tool_call_delta.id;
|
||||
tool_call["type"] = "function";
|
||||
}
|
||||
if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
|
||||
json function = json::object();
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
function["name"] = diff.tool_call_delta.name;
|
||||
}
|
||||
if (!diff.tool_call_delta.arguments.empty()) {
|
||||
function["arguments"] = diff.tool_call_delta.arguments;
|
||||
}
|
||||
tool_call["function"] = function;
|
||||
}
|
||||
delta["tool_calls"] = json::array({ tool_call });
|
||||
}
|
||||
return delta;
|
||||
}
|
||||
|
||||
json convert_transcriptions_to_chatcmpl(
|
||||
const json & inp_body,
|
||||
const std::map<std::string, raw_buffer> & in_files,
|
||||
std::vector<raw_buffer> & out_files) {
|
||||
// TODO @ngxson : this function may need to be improved in the future
|
||||
// handle input files
|
||||
out_files.clear();
|
||||
auto it = in_files.find("file");
|
||||
if (it != in_files.end()) {
|
||||
out_files.push_back(it->second);
|
||||
} else {
|
||||
throw std::invalid_argument("No input file found for transcription");
|
||||
}
|
||||
|
||||
// handle input data
|
||||
std::string prompt = json_value(inp_body, "prompt", std::string());
|
||||
std::string language = json_value(inp_body, "language", std::string());
|
||||
std::string response_format = json_value(inp_body, "response_format", std::string("json"));
|
||||
if (response_format != "json") {
|
||||
throw std::invalid_argument("Only 'json' response_format is supported for transcription");
|
||||
}
|
||||
if (prompt.empty()) {
|
||||
prompt = "Transcribe audio to text";
|
||||
}
|
||||
if (!language.empty()) {
|
||||
prompt += string_format(" (language: %s)", language.c_str());
|
||||
}
|
||||
prompt += get_media_marker();
|
||||
|
||||
json chatcmpl_body = inp_body; // copy all fields
|
||||
chatcmpl_body["messages"] = json::array({
|
||||
{
|
||||
{"role", "user"},
|
||||
{"content", prompt},
|
||||
},
|
||||
});
|
||||
|
||||
// because input from form-data, everything is string, we need to correct the types here
|
||||
std::string stream = json_value(inp_body, "stream", std::string("false"));
|
||||
chatcmpl_body["stream"] = stream == "true";
|
||||
|
||||
if (inp_body.contains("max_tokens")) {
|
||||
std::string inp = inp_body["max_tokens"].get<std::string>();
|
||||
chatcmpl_body["max_tokens"] = std::stoul(inp);
|
||||
}
|
||||
|
||||
if (inp_body.contains("temperature")) {
|
||||
std::string inp = inp_body["temperature"].get<std::string>();
|
||||
chatcmpl_body["temperature"] = std::stof(inp);
|
||||
}
|
||||
|
||||
return chatcmpl_body;
|
||||
}
|
||||
24
tools/server/server-chat.h
Normal file
24
tools/server/server-chat.h
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
// Chat conversion functions for server (Responses API, Anthropic API, OAI streaming diffs)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "chat.h"
|
||||
#include "server-common.h"
|
||||
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
// Convert OpenAI Responses API format to OpenAI Chat Completions API format
|
||||
json server_chat_convert_responses_to_chatcmpl(const json & body);
|
||||
|
||||
// Convert Anthropic Messages API format to OpenAI Chat Completions API format
|
||||
json server_chat_convert_anthropic_to_oai(const json & body);
|
||||
|
||||
// convert OpenAI transcriptions API format to OpenAI Chat Completions API format
|
||||
json convert_transcriptions_to_chatcmpl(
|
||||
const json & body,
|
||||
const std::map<std::string, raw_buffer> & in_files,
|
||||
std::vector<raw_buffer> & out_files);
|
||||
|
||||
json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
|
||||
|
|
@ -1027,6 +1027,8 @@ json oaicompat_chat_params_parse(
|
|||
}
|
||||
}
|
||||
|
||||
auto caps = common_chat_templates_get_caps(opt.tmpls.get());
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
|
||||
inputs.tools = common_chat_tools_parse_oaicompat(tools);
|
||||
|
|
@ -1034,7 +1036,7 @@ json oaicompat_chat_params_parse(
|
|||
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
||||
inputs.grammar = grammar;
|
||||
inputs.use_jinja = opt.use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
if (body.contains("reasoning_format")) {
|
||||
|
|
@ -1164,573 +1166,6 @@ json oaicompat_chat_params_parse(
|
|||
return llama_params;
|
||||
}
|
||||
|
||||
json convert_responses_to_chatcmpl(const json & response_body) {
|
||||
if (!response_body.contains("input")) {
|
||||
throw std::invalid_argument("'input' is required");
|
||||
}
|
||||
if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
|
||||
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
|
||||
}
|
||||
|
||||
const json input_value = response_body.at("input");
|
||||
json chatcmpl_body = response_body;
|
||||
chatcmpl_body.erase("input");
|
||||
std::vector<json> chatcmpl_messages;
|
||||
|
||||
if (response_body.contains("instructions")) {
|
||||
chatcmpl_messages.push_back({
|
||||
{"role", "system"},
|
||||
{"content", json_value(response_body, "instructions", std::string())},
|
||||
});
|
||||
chatcmpl_body.erase("instructions");
|
||||
}
|
||||
|
||||
if (input_value.is_string()) {
|
||||
// #responses_create-input-text_input
|
||||
chatcmpl_messages.push_back({
|
||||
{"role", "user"},
|
||||
{"content", input_value},
|
||||
});
|
||||
} else if (input_value.is_array()) {
|
||||
// #responses_create-input-input_item_list
|
||||
|
||||
static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
|
||||
return j.contains(key) && j.at(key).is_array();
|
||||
};
|
||||
static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
|
||||
return j.contains(key) && j.at(key).is_string();
|
||||
};
|
||||
|
||||
for (json item : input_value) {
|
||||
bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant";
|
||||
|
||||
if (exists_and_is_string(item, "content")) {
|
||||
// #responses_create-input-input_item_list-input_message-content-text_input
|
||||
// Only "Input message" contains item["content"]::string
|
||||
// After converting item["content"]::string to item["content"]::array,
|
||||
// we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
|
||||
item["content"] = json::array({
|
||||
json {
|
||||
{"text", item.at("content")},
|
||||
{"type", "input_text"}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (exists_and_is_array(item, "content") &&
|
||||
exists_and_is_string(item, "role") &&
|
||||
(item.at("role") == "user" ||
|
||||
item.at("role") == "system" ||
|
||||
item.at("role") == "developer")
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-input_message
|
||||
std::vector<json> chatcmpl_content;
|
||||
|
||||
for (const json & input_item : item.at("content")) {
|
||||
const std::string type = json_value(input_item, "type", std::string());
|
||||
|
||||
if (type == "input_text") {
|
||||
if (!input_item.contains("text")) {
|
||||
throw std::invalid_argument("'Input text' requires 'text'");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"text", input_item.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
} else if (type == "input_image") {
|
||||
// While `detail` is marked as required,
|
||||
// it has default value("auto") and can be omitted.
|
||||
|
||||
if (!input_item.contains("image_url")) {
|
||||
throw std::invalid_argument("'image_url' is required");
|
||||
}
|
||||
chatcmpl_content.push_back({
|
||||
{"image_url", json {
|
||||
{"url", input_item.at("image_url")}
|
||||
}},
|
||||
{"type", "image_url"},
|
||||
});
|
||||
} else if (type == "input_file") {
|
||||
throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
|
||||
// if (input_item.contains("file_url")) {
|
||||
// // chat completion API does not support file_url
|
||||
// throw std::invalid_argument("'file_url' is not supported");
|
||||
// }
|
||||
// if (!input_item.contains("file_data") || !input_item.contains("filename")) {
|
||||
// throw std::invalid_argument("Both 'file_data' and 'filename' are required");
|
||||
// }
|
||||
// chatcmpl_content.push_back({
|
||||
// {"file", json {
|
||||
// {"file_data", input_item.at("file_data")},
|
||||
// {"filename", input_item.at("filename")},
|
||||
// }},
|
||||
// {"type", "file"},
|
||||
// });
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
|
||||
}
|
||||
}
|
||||
|
||||
if (item.contains("type")) {
|
||||
item.erase("type");
|
||||
}
|
||||
if (item.contains("status")) {
|
||||
item.erase("status");
|
||||
}
|
||||
item["content"] = chatcmpl_content;
|
||||
|
||||
chatcmpl_messages.push_back(item);
|
||||
} else if (exists_and_is_array(item, "content") &&
|
||||
exists_and_is_string(item, "role") &&
|
||||
item.at("role") == "assistant" &&
|
||||
// exists_and_is_string(item, "status") &&
|
||||
// (item.at("status") == "in_progress" ||
|
||||
// item.at("status") == "completed" ||
|
||||
// item.at("status") == "incomplete") &&
|
||||
// item["status"] not sent by codex-cli
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "message"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-output_message
|
||||
auto chatcmpl_content = json::array();
|
||||
|
||||
for (const auto & output_text : item.at("content")) {
|
||||
const std::string type = json_value(output_text, "type", std::string());
|
||||
if (type == "output_text") {
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
}
|
||||
} else if (type == "refusal") {
|
||||
if (!exists_and_is_string(output_text, "refusal")) {
|
||||
throw std::invalid_argument("'Refusal' requires 'refusal'");
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"refusal", output_text.at("refusal")},
|
||||
{"type", "refusal"},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
|
||||
}
|
||||
}
|
||||
|
||||
if (merge_prev) {
|
||||
auto & prev_msg = chatcmpl_messages.back();
|
||||
if (!exists_and_is_array(prev_msg, "content")) {
|
||||
prev_msg["content"] = json::array();
|
||||
}
|
||||
auto & prev_content = prev_msg["content"];
|
||||
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
|
||||
} else {
|
||||
item.erase("status");
|
||||
item.erase("type");
|
||||
item["content"] = chatcmpl_content;
|
||||
chatcmpl_messages.push_back(item);
|
||||
}
|
||||
} else if (exists_and_is_string(item, "arguments") &&
|
||||
exists_and_is_string(item, "call_id") &&
|
||||
exists_and_is_string(item, "name") &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "function_call"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-function_tool_call
|
||||
json tool_call = {
|
||||
{"function", json {
|
||||
{"arguments", item.at("arguments")},
|
||||
{"name", item.at("name")},
|
||||
}},
|
||||
{"id", item.at("call_id")},
|
||||
{"type", "function"},
|
||||
};
|
||||
|
||||
if (merge_prev) {
|
||||
auto & prev_msg = chatcmpl_messages.back();
|
||||
if (!exists_and_is_array(prev_msg, "tool_calls")) {
|
||||
prev_msg["tool_calls"] = json::array();
|
||||
}
|
||||
prev_msg["tool_calls"].push_back(tool_call);
|
||||
} else {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"role", "assistant"},
|
||||
{"tool_calls", json::array({tool_call})}
|
||||
});
|
||||
}
|
||||
} else if (exists_and_is_string(item, "call_id") &&
|
||||
(exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "function_call_output"
|
||||
) {
|
||||
// #responses_create-input-input_item_list-item-function_tool_call_output
|
||||
if (item.at("output").is_string()) {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"content", item.at("output")},
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", item.at("call_id")},
|
||||
});
|
||||
} else {
|
||||
json chatcmpl_outputs = item.at("output");
|
||||
for (json & chatcmpl_output : chatcmpl_outputs) {
|
||||
if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
|
||||
throw std::invalid_argument("Output of tool call should be 'Input text'");
|
||||
}
|
||||
chatcmpl_output["type"] = "text";
|
||||
}
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"content", chatcmpl_outputs},
|
||||
{"role", "tool"},
|
||||
{"tool_call_id", item.at("call_id")},
|
||||
});
|
||||
}
|
||||
} else if (// exists_and_is_string(item, "id") &&
|
||||
// item["id"] not sent by codex-cli
|
||||
exists_and_is_array(item, "summary") &&
|
||||
exists_and_is_string(item, "type") &&
|
||||
item.at("type") == "reasoning") {
|
||||
// #responses_create-input-input_item_list-item-reasoning
|
||||
|
||||
if (!exists_and_is_array(item, "content")) {
|
||||
throw std::invalid_argument("item['content'] is not an array");
|
||||
}
|
||||
if (item.at("content").empty()) {
|
||||
throw std::invalid_argument("item['content'] is empty");
|
||||
}
|
||||
if (!exists_and_is_string(item.at("content")[0], "text")) {
|
||||
throw std::invalid_argument("item['content']['text'] is not a string");
|
||||
}
|
||||
|
||||
if (merge_prev) {
|
||||
auto & prev_msg = chatcmpl_messages.back();
|
||||
prev_msg["reasoning_content"] = item.at("content")[0].at("text");
|
||||
} else {
|
||||
chatcmpl_messages.push_back(json {
|
||||
{"role", "assistant"},
|
||||
{"content", json::array()},
|
||||
{"reasoning_content", item.at("content")[0].at("text")},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("Cannot determine type of 'item'");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("'input' must be a string or array of objects");
|
||||
}
|
||||
|
||||
chatcmpl_body["messages"] = chatcmpl_messages;
|
||||
|
||||
if (response_body.contains("tools")) {
|
||||
if (!response_body.at("tools").is_array()) {
|
||||
throw std::invalid_argument("'tools' must be an array of objects");
|
||||
}
|
||||
std::vector<json> chatcmpl_tools;
|
||||
for (json resp_tool : response_body.at("tools")) {
|
||||
json chatcmpl_tool;
|
||||
|
||||
if (json_value(resp_tool, "type", std::string()) != "function") {
|
||||
throw std::invalid_argument("'type' of tool must be 'function'");
|
||||
}
|
||||
resp_tool.erase("type");
|
||||
chatcmpl_tool["type"] = "function";
|
||||
|
||||
if (!resp_tool.contains("strict")) {
|
||||
resp_tool["strict"] = true;
|
||||
}
|
||||
chatcmpl_tool["function"] = resp_tool;
|
||||
chatcmpl_tools.push_back(chatcmpl_tool);
|
||||
}
|
||||
chatcmpl_body.erase("tools");
|
||||
chatcmpl_body["tools"] = chatcmpl_tools;
|
||||
}
|
||||
|
||||
if (response_body.contains("max_output_tokens")) {
|
||||
chatcmpl_body.erase("max_output_tokens");
|
||||
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
|
||||
}
|
||||
|
||||
return chatcmpl_body;
|
||||
}
|
||||
|
||||
// Translate an OpenAI transcriptions API request (arriving as multipart
// form-data) into an OpenAI Chat Completions request body.
// The uploaded audio payload is copied into out_files so the caller can
// attach it as media; all other form fields are carried over, with the
// string-typed numeric fields coerced to their proper JSON types.
// Throws std::invalid_argument if no file was uploaded or if an
// unsupported response_format is requested.
json convert_transcriptions_to_chatcmpl(
        const json & inp_body,
        const std::map<std::string, raw_buffer> & in_files,
        std::vector<raw_buffer> & out_files) {
    // TODO @ngxson : this function may need to be improved in the future
    // the audio payload must be present under the "file" form field
    out_files.clear();
    const auto file_it = in_files.find("file");
    if (file_it == in_files.end()) {
        throw std::invalid_argument("No input file found for transcription");
    }
    out_files.push_back(file_it->second);

    // only plain JSON output is implemented
    const std::string response_format = json_value(inp_body, "response_format", std::string("json"));
    if (response_format != "json") {
        throw std::invalid_argument("Only 'json' response_format is supported for transcription");
    }

    // build the user prompt: default instruction, optional language hint,
    // then the media marker that tells the server where the audio is inserted
    std::string prompt = json_value(inp_body, "prompt", std::string());
    if (prompt.empty()) {
        prompt = "Transcribe audio to text";
    }
    const std::string language = json_value(inp_body, "language", std::string());
    if (!language.empty()) {
        prompt += string_format(" (language: %s)", language.c_str());
    }
    prompt += get_media_marker();

    json chatcmpl_body = inp_body; // copy all fields
    json user_msg;
    user_msg["role"]    = "user";
    user_msg["content"] = prompt;
    chatcmpl_body["messages"] = json::array({user_msg});

    // because input is from form-data, every value arrives as a string;
    // correct the types of the fields chat completion expects to be typed
    chatcmpl_body["stream"] = json_value(inp_body, "stream", std::string("false")) == "true";

    if (inp_body.contains("max_tokens")) {
        chatcmpl_body["max_tokens"] = std::stoul(inp_body["max_tokens"].get<std::string>());
    }

    if (inp_body.contains("temperature")) {
        chatcmpl_body["temperature"] = std::stof(inp_body["temperature"].get<std::string>());
    }

    return chatcmpl_body;
}
|
||||
|
||||
// Translate an Anthropic Messages API request body into an OpenAI Chat
// Completions request body.
//
// Mapping performed (each step grounded in the code below):
//   - "system" (string or array of text blocks) -> a leading "system" message
//   - content blocks: "text" passed through; "thinking" accumulated into
//     "reasoning_content"; "image" (base64 or url source) -> "image_url";
//     "tool_use" -> OpenAI "tool_calls"; "tool_result" -> separate "tool"
//     role messages emitted AFTER the containing message (order matters for
//     OpenAI, which expects tool results as standalone follow-up messages)
//   - "tools" (name/description/input_schema) -> OpenAI function tools
//   - "tool_choice" type: "auto" -> "auto"; "any"/"tool" -> "required"
//   - "stop_sequences" -> "stop"; "max_tokens" defaulted to 4096 if absent
//   - "thinking"/"metadata" mapped to server-internal keys
//     ("thinking_budget_tokens", "__metadata_user_id")
// Throws std::runtime_error if "messages" is missing.
json convert_anthropic_to_oai(const json & body) {
    json oai_body;

    // Convert system prompt
    // Anthropic accepts either a plain string or an array of {type:"text"} blocks;
    // both forms are flattened into a single system message.
    json oai_messages = json::array();
    auto system_param = json_value(body, "system", json());
    if (!system_param.is_null()) {
        std::string system_content;

        if (system_param.is_string()) {
            system_content = system_param.get<std::string>();
        } else if (system_param.is_array()) {
            // concatenate all text blocks; non-text block types are ignored
            for (const auto & block : system_param) {
                if (json_value(block, "type", std::string()) == "text") {
                    system_content += json_value(block, "text", std::string());
                }
            }
        }

        oai_messages.push_back({
            {"role", "system"},
            {"content", system_content}
        });
    }

    // Convert messages
    if (!body.contains("messages")) {
        throw std::runtime_error("'messages' is required");
    }
    const json & messages = body.at("messages");
    if (messages.is_array()) {
        for (const auto & msg : messages) {
            std::string role = json_value(msg, "role", std::string());

            if (!msg.contains("content")) {
                // assistant messages without content carry no information -> drop;
                // other roles are forwarded unchanged
                if (role == "assistant") {
                    continue;
                }
                oai_messages.push_back(msg);
                continue;
            }

            const json & content = msg.at("content");

            // plain-string content is already valid OpenAI shape -> pass through
            if (content.is_string()) {
                oai_messages.push_back(msg);
                continue;
            }

            // anything that is neither string nor array is forwarded as-is
            if (!content.is_array()) {
                oai_messages.push_back(msg);
                continue;
            }

            // accumulators for the block-by-block conversion of this message
            json tool_calls = json::array();
            json converted_content = json::array();
            json tool_results = json::array();
            std::string reasoning_content;
            bool has_tool_calls = false;

            for (const auto & block : content) {
                std::string type = json_value(block, "type", std::string());

                if (type == "text") {
                    converted_content.push_back(block);
                } else if (type == "thinking") {
                    // multiple thinking blocks are concatenated
                    reasoning_content += json_value(block, "thinking", std::string());
                } else if (type == "image") {
                    json source = json_value(block, "source", json::object());
                    std::string source_type = json_value(source, "type", std::string());

                    if (source_type == "base64") {
                        // inline image -> data URI in OpenAI image_url form
                        std::string media_type = json_value(source, "media_type", std::string("image/jpeg"));
                        std::string data = json_value(source, "data", std::string());
                        std::ostringstream ss;
                        ss << "data:" << media_type << ";base64," << data;

                        converted_content.push_back({
                            {"type", "image_url"},
                            {"image_url", {
                                {"url", ss.str()}
                            }}
                        });
                    } else if (source_type == "url") {
                        std::string url = json_value(source, "url", std::string());
                        converted_content.push_back({
                            {"type", "image_url"},
                            {"image_url", {
                                {"url", url}
                            }}
                        });
                    }
                    // NOTE(review): unknown source types are silently dropped
                } else if (type == "tool_use") {
                    // Anthropic passes tool arguments as a JSON object ("input");
                    // OpenAI expects them serialized as a string -> .dump()
                    tool_calls.push_back({
                        {"id", json_value(block, "id", std::string())},
                        {"type", "function"},
                        {"function", {
                            {"name", json_value(block, "name", std::string())},
                            {"arguments", json_value(block, "input", json::object()).dump()}
                        }}
                    });
                    has_tool_calls = true;
                } else if (type == "tool_result") {
                    std::string tool_use_id = json_value(block, "tool_use_id", std::string());

                    // result content may be a string or an array of text blocks;
                    // flatten either form into a single string
                    auto result_content = json_value(block, "content", json());
                    std::string result_text;
                    if (result_content.is_string()) {
                        result_text = result_content.get<std::string>();
                    } else if (result_content.is_array()) {
                        for (const auto & c : result_content) {
                            if (json_value(c, "type", std::string()) == "text") {
                                result_text += json_value(c, "text", std::string());
                            }
                        }
                    }

                    // buffered here and emitted after the current message, since
                    // OpenAI represents tool results as standalone "tool" messages
                    tool_results.push_back({
                        {"role", "tool"},
                        {"tool_call_id", tool_use_id},
                        {"content", result_text}
                    });
                }
                // NOTE(review): unrecognized block types are silently ignored
            }

            // emit the converted message only if it carries anything
            if (!converted_content.empty() || has_tool_calls || !reasoning_content.empty()) {
                json new_msg = {{"role", role}};
                if (!converted_content.empty()) {
                    new_msg["content"] = converted_content;
                } else if (has_tool_calls || !reasoning_content.empty()) {
                    // OpenAI requires a content field even when only tool calls /
                    // reasoning are present
                    new_msg["content"] = "";
                }
                if (!tool_calls.empty()) {
                    new_msg["tool_calls"] = tool_calls;
                }
                if (!reasoning_content.empty()) {
                    new_msg["reasoning_content"] = reasoning_content;
                }
                oai_messages.push_back(new_msg);
            }

            // tool results follow the message that contained them
            for (const auto & tool_msg : tool_results) {
                oai_messages.push_back(tool_msg);
            }
        }
    }

    oai_body["messages"] = oai_messages;

    // Convert tools
    // Anthropic tool {name, description, input_schema} -> OpenAI function tool
    if (body.contains("tools")) {
        const json & tools = body.at("tools");
        if (tools.is_array()) {
            json oai_tools = json::array();
            for (const auto & tool : tools) {
                oai_tools.push_back({
                    {"type", "function"},
                    {"function", {
                        {"name", json_value(tool, "name", std::string())},
                        {"description", json_value(tool, "description", std::string())},
                        {"parameters", tool.contains("input_schema") ? tool.at("input_schema") : json::object()}
                    }}
                });
            }
            oai_body["tools"] = oai_tools;
        }
    }

    // Convert tool_choice
    // "auto" maps directly; "any" and "tool" both force a tool call ("required").
    // NOTE(review): the specific tool name of a {"type":"tool"} choice is not forwarded.
    if (body.contains("tool_choice")) {
        const json & tc = body.at("tool_choice");
        if (tc.is_object()) {
            std::string type = json_value(tc, "type", std::string());
            if (type == "auto") {
                oai_body["tool_choice"] = "auto";
            } else if (type == "any" || type == "tool") {
                oai_body["tool_choice"] = "required";
            }
        }
    }

    // Convert stop_sequences to stop
    if (body.contains("stop_sequences")) {
        oai_body["stop"] = body.at("stop_sequences");
    }

    // Handle max_tokens (required in Anthropic, but we're permissive)
    if (body.contains("max_tokens")) {
        oai_body["max_tokens"] = body.at("max_tokens");
    } else {
        oai_body["max_tokens"] = 4096;
    }

    // Pass through common params
    for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
        if (body.contains(key)) {
            oai_body[key] = body.at(key);
        }
    }

    // Handle Anthropic-specific thinking param
    // {"type":"enabled","budget_tokens":N} -> server-internal budget field
    if (body.contains("thinking")) {
        json thinking = json_value(body, "thinking", json::object());
        std::string thinking_type = json_value(thinking, "type", std::string());
        if (thinking_type == "enabled") {
            int budget_tokens = json_value(thinking, "budget_tokens", 10000);
            oai_body["thinking_budget_tokens"] = budget_tokens;
        }
    }

    // Handle Anthropic-specific metadata param
    // user_id is stashed under a private key for downstream use
    if (body.contains("metadata")) {
        json metadata = json_value(body, "metadata", json::object());
        std::string user_id = json_value(metadata, "user_id", std::string());
        if (!user_id.empty()) {
            oai_body["__metadata_user_id"] = user_id;
        }
    }

    return oai_body;
}
|
||||
|
||||
json format_embeddings_response_oaicompat(
|
||||
const json & request,
|
||||
const std::string & model_name,
|
||||
|
|
|
|||
|
|
@ -307,18 +307,6 @@ json oaicompat_chat_params_parse(
|
|||
const server_chat_params & opt,
|
||||
std::vector<raw_buffer> & out_files);
|
||||
|
||||
// convert OpenAI Responses API format to OpenAI Chat Completions API format
|
||||
json convert_responses_to_chatcmpl(const json & body);
|
||||
|
||||
// convert OpenAI transcriptions API format to OpenAI Chat Completions API format
|
||||
json convert_transcriptions_to_chatcmpl(
|
||||
const json & body,
|
||||
const std::map<std::string, raw_buffer> & in_files,
|
||||
std::vector<raw_buffer> & out_files);
|
||||
|
||||
// convert Anthropic Messages API format to OpenAI Chat Completions API format
|
||||
json convert_anthropic_to_oai(const json & body);
|
||||
|
||||
// TODO: move it to server-task.cpp
|
||||
json format_embeddings_response_oaicompat(
|
||||
const json & request,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
|
||||
#include "server-context.h"
|
||||
#include "server-chat.h"
|
||||
#include "server-common.h"
|
||||
#include "server-http.h"
|
||||
#include "server-task.h"
|
||||
|
|
@ -1044,8 +1045,8 @@ private:
|
|||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
/* reasoning_budget */ params_base.reasoning_budget,
|
||||
/* reasoning_budget_msg */ params_base.reasoning_budget_message,
|
||||
/* reasoning_budget */ params_base.sampling.reasoning_budget_tokens,
|
||||
/* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message,
|
||||
/* media_path */ params_base.media_path,
|
||||
/* force_pure_content */ params_base.force_pure_content_parser
|
||||
};
|
||||
|
|
@ -2960,7 +2961,13 @@ private:
|
|||
|
||||
// verify and try to accept the draft
|
||||
{
|
||||
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
|
||||
const bool use_ckpt = slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
|
||||
|
||||
// only save the sampler sampler state if we use checkpoints
|
||||
common_sampler_ptr smpl_save;
|
||||
if (use_ckpt) {
|
||||
smpl_save.reset(common_sampler_clone(slot.smpl.get()));
|
||||
}
|
||||
|
||||
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
|
||||
auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft);
|
||||
|
|
@ -2972,7 +2979,7 @@ private:
|
|||
|
||||
// check for partial draft acceptance
|
||||
if (accepted.size() < slot.spec_draft.size() + 1) {
|
||||
if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
|
||||
if (use_ckpt) {
|
||||
// partial acceptance is not supported by the context -> truncate the draft and restore the state
|
||||
slot.spec_draft = std::move(accepted);
|
||||
|
||||
|
|
@ -3774,7 +3781,7 @@ void server_routes::init_routes() {
|
|||
this->post_responses_oai = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
json body = convert_responses_to_chatcmpl(json::parse(req.body));
|
||||
json body = server_chat_convert_responses_to_chatcmpl(json::parse(req.body));
|
||||
SRV_DBG("%s\n", "Request converted: OpenAI Responses -> OpenAI Chat Completions");
|
||||
SRV_DBG("converted request: %s\n", body.dump().c_str());
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
|
|
@ -3819,7 +3826,7 @@ void server_routes::init_routes() {
|
|||
this->post_anthropic_messages = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
json body = convert_anthropic_to_oai(json::parse(req.body));
|
||||
json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
|
||||
SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
|
||||
SRV_DBG("converted request: %s\n", body.dump().c_str());
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
|
|
@ -3837,7 +3844,7 @@ void server_routes::init_routes() {
|
|||
this->post_anthropic_count_tokens = [this](const server_http_req & req) {
|
||||
auto res = create_response();
|
||||
std::vector<raw_buffer> files;
|
||||
json body = convert_anthropic_to_oai(json::parse(req.body));
|
||||
json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
|
||||
SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
|
||||
SRV_DBG("converted request: %s\n", body.dump().c_str());
|
||||
json body_parsed = oaicompat_chat_params_parse(
|
||||
|
|
|
|||
|
|
@ -712,6 +712,11 @@ void server_models::unload(const std::string & name) {
|
|||
if (it->second.meta.is_running()) {
|
||||
SRV_INF("stopping model instance name=%s\n", name.c_str());
|
||||
stopping_models.insert(name);
|
||||
if (it->second.meta.status == SERVER_MODEL_STATUS_LOADING) {
|
||||
// special case: if model is in loading state, unloading means force-killing it
|
||||
SRV_WRN("model name=%s is still loading, force-killing\n", name.c_str());
|
||||
subprocess_terminate(it->second.subproc.get());
|
||||
}
|
||||
cv_stop.notify_all();
|
||||
// status change will be handled by the managing thread
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "server-task.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "server-chat.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
|
@ -873,7 +874,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
|
|||
json {
|
||||
{"finish_reason", nullptr},
|
||||
{"index", index},
|
||||
{"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
|
||||
{"delta", server_chat_msg_diff_to_json_oaicompat(diff)},
|
||||
},
|
||||
})},
|
||||
{"created", t},
|
||||
|
|
@ -1110,7 +1111,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
|
|||
json server_task_result_cmpl_final::to_json_oaicompat_asr() {
|
||||
json event = json {
|
||||
{"type", "transcript.text.done"},
|
||||
{"text", content},
|
||||
{"text", oaicompat_msg.content},
|
||||
{"usage", json {
|
||||
{"type", "tokens"},
|
||||
{"input_tokens", n_prompt_tokens},
|
||||
|
|
@ -1522,7 +1523,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
|
|||
}
|
||||
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
|
||||
add_delta(server_chat_msg_diff_to_json_oaicompat(diff));
|
||||
}
|
||||
|
||||
if (!deltas.empty()) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue