mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-18 23:49:46 +00:00
note: smartcache is broken for rnn currently
This commit is contained in:
commit
f3d2f58fa8
30 changed files with 2232 additions and 875 deletions
138
.devops/openvino.Dockerfile
Normal file
138
.devops/openvino.Dockerfile
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
ARG OPENVINO_VERSION_MAJOR=2026.0
|
||||
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# Optional proxy build arguments - empty by default
|
||||
ARG http_proxy=
|
||||
ARG https_proxy=
|
||||
|
||||
## Build Image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS build
|
||||
|
||||
# Pass proxy args to build stage
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
gnupg \
|
||||
wget \
|
||||
git \
|
||||
cmake \
|
||||
ninja-build \
|
||||
build-essential \
|
||||
libtbb12 \
|
||||
libssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
opencl-headers \
|
||||
opencl-clhpp-headers \
|
||||
intel-opencl-icd && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install OpenVINO for Ubuntu 24.04
|
||||
ARG OPENVINO_VERSION_MAJOR
|
||||
ARG OPENVINO_VERSION_FULL
|
||||
RUN mkdir -p /opt/intel && \
|
||||
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
|
||||
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
|
||||
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
|
||||
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
|
||||
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
|
||||
cd - && \
|
||||
ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
|
||||
|
||||
ENV OpenVINO_DIR=/opt/intel/openvino
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Build Stage
|
||||
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON && \
|
||||
cmake --build build/ReleaseOV -j$(nproc)"
|
||||
|
||||
# Copy all necessary libraries
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
|
||||
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
|
||||
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
|
||||
|
||||
# Create runtime directories and copy binaries
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/ReleaseOV/bin/* /app/full/ \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base Runtime Image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
|
||||
# Pass proxy args to runtime stage
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app/
|
||||
|
||||
### Full (all binaries)
|
||||
FROM base AS full
|
||||
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
COPY --from=build /app/full /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git \
|
||||
python3 \
|
||||
python3-venv \
|
||||
python3-pip && \
|
||||
python3 -m venv /ov-venv && \
|
||||
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
|
||||
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /tmp/* /var/tmp/* && \
|
||||
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
|
||||
find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
|
||||
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
#include <stdexcept>
|
||||
|
|
@ -182,7 +183,10 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
|||
case tool_format::TAG_WITH_TAGGED:
|
||||
return build_tool_parser_tag_tagged(ctx);
|
||||
default:
|
||||
GGML_ABORT("Unable to create tool parser");
|
||||
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
||||
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
||||
"report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
|
||||
return ctx.p.eps();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
37
ggml/include/ggml-openvino.h
Normal file
37
ggml/include/ggml-openvino.h
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_OPENVINO_NAME "OPENVINO"
|
||||
|
||||
// backend API
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
|
||||
|
||||
GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
|
||||
|
||||
// device buffer
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
|
||||
|
||||
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -82,6 +82,10 @@
|
|||
#include "ggml-zendnn.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_OPENVINO
|
||||
#include "ggml-openvino.h"
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
static std::string path_str(const fs::path & path) {
|
||||
|
|
@ -154,6 +158,9 @@ struct ggml_backend_registry {
|
|||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_OPENVINO
|
||||
register_backend(ggml_backend_openvino_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
|
|
@ -558,6 +565,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
|||
ggml_backend_load_best("opencl", silent, dir_path);
|
||||
ggml_backend_load_best("hexagon", silent, dir_path);
|
||||
ggml_backend_load_best("musa", silent, dir_path);
|
||||
ggml_backend_load_best("openvino", silent, dir_path);
|
||||
ggml_backend_load_best("cpu", silent, dir_path);
|
||||
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
|
||||
const char * backend_path = std::getenv("GGML_BACKEND_PATH");
|
||||
|
|
|
|||
|
|
@ -175,13 +175,6 @@
|
|||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -9624,7 +9624,7 @@ void ggml_compute_forward_win_unpart(
|
|||
}
|
||||
}
|
||||
|
||||
//gmml_compute_forward_unary
|
||||
//ggml_compute_forward_unary
|
||||
|
||||
void ggml_compute_forward_unary(
|
||||
const ggml_compute_params * params,
|
||||
|
|
@ -10477,34 +10477,40 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
|||
const float beta_val = *(const float *)((const char *)src_beta->data + iv3 * nbb3 + t * nbb2 + iv1 * nbb1);
|
||||
const float * g_d = (const float *)((const char *)src_g->data + iv3 * nbg3 + t * nbg2 + iv1 * nbg1);
|
||||
|
||||
// state is stored transposed: s_out[j*S_v + i] = S[i][j]
|
||||
// so row j of s_out = column j of S (contiguous access)
|
||||
|
||||
if (kda) {
|
||||
// precompute exp(g) into delta scratch (reused below)
|
||||
for (int64_t i = 0; i < S_v; ++i) {
|
||||
ggml_vec_scale_f32(S_v, &s_out[i * S_v], expf(g_d[i]));
|
||||
delta[i] = expf(g_d[i]);
|
||||
}
|
||||
// S[i][:] *= exp(g[i]) => for each row j of M: M[j][i] *= exp(g[i])
|
||||
for (int64_t j = 0; j < S_v; ++j) {
|
||||
ggml_vec_mul_f32(S_v, &s_out[j * S_v], &s_out[j * S_v], delta);
|
||||
}
|
||||
} else {
|
||||
ggml_vec_scale_f32(S_v * S_v, s_out, expf(g_d[0]));
|
||||
}
|
||||
|
||||
// delta[j] = sum_i S[j][i] * k[i]
|
||||
memset(delta, 0, S_v * sizeof(float));
|
||||
for (int64_t i = 0; i < S_v; ++i) {
|
||||
ggml_vec_mad_f32(S_v, delta, &s_out[i * S_v], k_d[i]);
|
||||
}
|
||||
// delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
|
||||
for (int64_t j = 0; j < S_v; ++j) {
|
||||
delta[j] = (v_d[j] - delta[j]) * beta_val;
|
||||
float sum = 0.0f;
|
||||
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
|
||||
delta[j] = (v_d[j] - sum) * beta_val;
|
||||
}
|
||||
|
||||
// outer product: S[j][i] += k[i] * delta[j]
|
||||
for (int64_t i = 0; i < S_v; ++i) {
|
||||
ggml_vec_mad_f32(S_v, &s_out[i * S_v], delta, k_d[i]);
|
||||
// outer product: S[i][j] += k[i] * delta[j] => M[j][i] += delta[j] * k[i]
|
||||
for (int64_t j = 0; j < S_v; ++j) {
|
||||
ggml_vec_mad_f32(S_v, &s_out[j * S_v], k_d, delta[j]);
|
||||
}
|
||||
|
||||
// attn_out[j] = sum_i S[j][i] * q[i]
|
||||
memset(attn_data, 0, S_v * sizeof(float));
|
||||
for (int64_t i = 0; i < S_v; ++i) {
|
||||
ggml_vec_mad_f32(S_v, attn_data, &s_out[i * S_v], q_d[i]);
|
||||
// attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
|
||||
for (int64_t j = 0; j < S_v; ++j) {
|
||||
float sum = 0.0f;
|
||||
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
|
||||
attn_data[j] = sum * scale;
|
||||
}
|
||||
ggml_vec_scale_f32(S_v, attn_data, scale);
|
||||
|
||||
attn_data += S_v * H; // advance to next token
|
||||
}
|
||||
|
|
|
|||
|
|
@ -479,13 +479,51 @@ do { \
|
|||
|
||||
// F16 AVX512
|
||||
|
||||
// F16 AVX
|
||||
#if defined(__AVX512FP16__)
|
||||
|
||||
#define GGML_F16_STEP 128
|
||||
#define GGML_F16_EPR 32
|
||||
|
||||
#define GGML_F16x32 __m512h
|
||||
#define GGML_F16x32_ZERO _mm512_setzero_ph()
|
||||
#define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x))
|
||||
#define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x)
|
||||
#define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y)
|
||||
#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
|
||||
#define GGML_F16x32_ADD _mm512_add_ph
|
||||
#define GGML_F16x32_MUL _mm512_mul_ph
|
||||
#define GGML_F16x32_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = (ggml_float) _mm512_reduce_add_ph(x[0]); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x32
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x32_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x32_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x32_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x32_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x32_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE
|
||||
|
||||
#else // Fallback FP16 <-> FP32
|
||||
|
||||
#define GGML_F16_STEP 64
|
||||
#define GGML_F16_EPR 16
|
||||
|
||||
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
|
||||
|
||||
#define GGML_F32Cx16 __m512
|
||||
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
||||
|
|
@ -525,6 +563,8 @@ do { \
|
|||
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
||||
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
||||
|
||||
#endif // __AVX512FP16__
|
||||
#elif defined(__AVX__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
|
|
|||
|
|
@ -56,7 +56,8 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
|||
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
||||
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
|
||||
__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||
__shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||
int cur_tile_buf = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
|
||||
|
|
@ -70,7 +71,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
|||
if(x < ne01 && y + j < ne00){
|
||||
const int row = threadIdx.y+j;
|
||||
const int col = threadIdx.x * sizeof(float)/sizeof(T);
|
||||
T *tile2 = reinterpret_cast<T*>(tile[row]);
|
||||
T *tile2 = reinterpret_cast<T*>(tile[cur_tile_buf][row]);
|
||||
tile2[col] = src[imat*n + (y+j)*ne01 + x];
|
||||
}
|
||||
}
|
||||
|
|
@ -81,10 +82,12 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
|
|||
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
||||
if (ty + j < ne01 && tx < ne00) {
|
||||
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
|
||||
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
|
||||
const T *tile2 = reinterpret_cast<const T*>(tile[cur_tile_buf][threadIdx.x]);
|
||||
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
|
||||
}
|
||||
}
|
||||
|
||||
cur_tile_buf = (cur_tile_buf + 1) % 2;
|
||||
}
|
||||
|
||||
GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,
|
||||
|
|
|
|||
|
|
@ -45,10 +45,11 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
|
||||
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
|
||||
float s_shard[rows_per_lane];
|
||||
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = curr_state[i * S_v + col];
|
||||
s_shard[r] = curr_state[col * S_v + i];
|
||||
}
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
|
|
@ -126,23 +127,14 @@ __global__ void gated_delta_net_cuda(const float * q,
|
|||
attn_data += S_v * H;
|
||||
}
|
||||
|
||||
// Write state back to global memory
|
||||
// Write state back to global memory (transposed layout)
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
state[i * S_v + col] = s_shard[r];
|
||||
state[col * S_v + i] = s_shard[r];
|
||||
}
|
||||
}
|
||||
|
||||
static size_t calculate_smem(const int sv, int cc)
|
||||
{
|
||||
size_t smem = 0;
|
||||
if ((GGML_CUDA_CC_IS_AMD(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_RDNA4(cc)) || GGML_CUDA_CC_IS_MTHREADS(cc)) {
|
||||
smem = sv * sv * sizeof(float);
|
||||
}
|
||||
return smem;
|
||||
}
|
||||
|
||||
template <bool KDA>
|
||||
static void launch_gated_delta_net(
|
||||
const float * q_d, const float * k_d, const float * v_d,
|
||||
|
|
@ -179,18 +171,14 @@ static void launch_gated_delta_net(
|
|||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||
break;
|
||||
case 64: {
|
||||
constexpr int sv = 64;
|
||||
size_t smem = calculate_smem(sv, cc);
|
||||
gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
|
||||
gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||
break;
|
||||
}
|
||||
case 128: {
|
||||
constexpr int sv = 128;
|
||||
size_t smem = calculate_smem(sv, cc);
|
||||
gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
|
||||
gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
|
||||
q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
|
||||
n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
|
||||
sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
|
||||
|
|
|
|||
|
|
@ -1242,6 +1242,34 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|||
}
|
||||
}
|
||||
|
||||
struct cublas_force_compute_type {
|
||||
bool fp32 = false;
|
||||
bool fp16 = false;
|
||||
};
|
||||
|
||||
static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type() {
|
||||
static const cublas_force_compute_type compute_type = [] {
|
||||
cublas_force_compute_type result;
|
||||
|
||||
const bool ggml_cuda_force_cublas_compute_32f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F") != nullptr;
|
||||
const bool ggml_cuda_force_cublas_compute_16f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F") != nullptr;
|
||||
|
||||
GGML_ASSERT(ggml_cuda_force_cublas_compute_16f_env == false || ggml_cuda_force_cublas_compute_32f_env == false);
|
||||
|
||||
if (ggml_cuda_force_cublas_compute_32f_env) {
|
||||
GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F\n");
|
||||
result.fp32 = true;
|
||||
} else if (ggml_cuda_force_cublas_compute_16f_env) {
|
||||
GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F\n");
|
||||
result.fp16 = true;
|
||||
}
|
||||
|
||||
return result;
|
||||
}();
|
||||
|
||||
return compute_type;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_mul_mat_cublas(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
|
|
@ -1324,7 +1352,13 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
|
||||
|
||||
if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|
||||
|| GGML_CUDA_CC_IS_RDNA4(cc)
|
||||
|| cc == GGML_CUDA_CC_VOLTA
|
||||
|| force_compute_type.fp32))
|
||||
{
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
CUBLAS_CHECK(
|
||||
|
|
@ -1923,10 +1957,23 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
|||
cudaDataType_t cu_data_type_b = traits::data_type;
|
||||
const void * alpha = traits::get_alpha();
|
||||
const void * beta = traits::get_beta();
|
||||
const float alpha_f32 = 1.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
|
||||
|
||||
int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
static constexpr bool is_src0_type_f16 = src0_type == GGML_TYPE_F16;
|
||||
|
||||
// bf16 and fp32 are already being computed in fp32 (ensure it using static_assert),
|
||||
// so checking necessity of forced fp32 only for fp16 src0_type
|
||||
static_assert(is_src0_type_f16 || traits::compute_type == CUBLAS_COMPUTE_32F);
|
||||
|
||||
const bool need_compute_32f = is_src0_type_f16 && !force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|
||||
|| GGML_CUDA_CC_IS_RDNA4(cc)
|
||||
|| cc == GGML_CUDA_CC_VOLTA
|
||||
|| force_compute_type.fp32);
|
||||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT && !need_compute_32f) {
|
||||
if constexpr (src0_type == GGML_TYPE_F32) {
|
||||
dst_t = (char *) dst_ddf; // Direct F32 output
|
||||
} else {
|
||||
|
|
@ -1936,18 +1983,10 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
|
|||
}
|
||||
} else {
|
||||
dst_t = (char *) dst_ddf;
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
cu_data_type = CUDA_R_32F;
|
||||
alpha = &alpha_f32;
|
||||
beta = &beta_f32;
|
||||
}
|
||||
|
||||
int id = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
alpha = &alpha_f32;
|
||||
beta = &beta_f32;
|
||||
cu_compute_type = batched_mul_mat_traits<GGML_TYPE_F32>::compute_type;
|
||||
cu_data_type = batched_mul_mat_traits<GGML_TYPE_F32>::data_type;
|
||||
alpha = batched_mul_mat_traits<GGML_TYPE_F32>::get_alpha();
|
||||
beta = batched_mul_mat_traits<GGML_TYPE_F32>::get_beta();
|
||||
}
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
|
|
|
|||
|
|
@ -2469,13 +2469,14 @@ kernel void kernel_gated_delta_net_impl(
|
|||
|
||||
const float scale = 1.0f / sqrt((float)S_v);
|
||||
|
||||
device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20;
|
||||
// state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
|
||||
device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||
|
||||
float ls[NSG];
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
ls[j] = s_ptr[is*S_v];
|
||||
ls[j] = s_ptr[is];
|
||||
}
|
||||
|
||||
device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
|
||||
|
|
@ -2536,11 +2537,11 @@ kernel void kernel_gated_delta_net_impl(
|
|||
g_ptr += args.ne21*G;
|
||||
}
|
||||
|
||||
device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20;
|
||||
device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
dst_state[is*S_v] = ls[j];
|
||||
dst_state[is] = ls[j];
|
||||
}
|
||||
|
||||
#undef S_v
|
||||
|
|
|
|||
|
|
@ -1,139 +0,0 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
// max workgroup size is usually 1024, this covers various subgroups sizes
|
||||
#define MAX_SUBGROUPS 128
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_32
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_cumsum_blk(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * tmp,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
uint net0,
|
||||
uint net1,
|
||||
uint net2
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int nth = get_local_size(0);
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint sg_size = get_sub_group_size();
|
||||
const uint sg_id = get_sub_group_id();
|
||||
const uint sg_lid = get_sub_group_local_id();
|
||||
|
||||
const int ib = i1 / ne01;
|
||||
const int i00 = ib * nth;
|
||||
const int i01 = i1 % ne01;
|
||||
const int i02 = i2;
|
||||
const int i03 = i3;
|
||||
|
||||
global const float * src0_row = (global const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
global float * tmp_row = (global float *)tmp + net0 * i01 + net0 * net1 * i02 + net0 * net1 * net2 * i03;
|
||||
global float * dst_row = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
__local float partial[MAX_SUBGROUPS];
|
||||
|
||||
float v = 0.0f;
|
||||
if (i00 + tid < ne00) {
|
||||
v = src0_row[i00 + tid];
|
||||
}
|
||||
|
||||
float s = sub_group_scan_inclusive_add(v);
|
||||
if (sg_lid == sg_size - 1) {
|
||||
partial[sg_id] = s;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// NB: subgroup size should be larger than number of subgroups
|
||||
// assuming max workgroup size of 1024, subgroup size should be >= 32
|
||||
if (sg_id == 0) {
|
||||
float x = 0.0f;
|
||||
if (sg_lid < get_num_sub_groups()) {
|
||||
x = partial[sg_lid];
|
||||
}
|
||||
float ex = sub_group_scan_exclusive_add(x);
|
||||
if (sg_lid < get_num_sub_groups()) {
|
||||
partial[sg_lid] = ex;
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s += partial[sg_id];
|
||||
|
||||
if (i00 + tid < ne00) {
|
||||
dst_row[i00 + tid] = s;
|
||||
}
|
||||
if (ne00 > nth && tid == nth - 1) {
|
||||
tmp_row[ib] = s;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_cumsum_add(
|
||||
global char * tmp,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
uint nbt0,
|
||||
uint nbt1,
|
||||
uint nbt2,
|
||||
uint nbt3
|
||||
) {
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int nth = get_local_size(0);
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const int ib = i1 / ne01;
|
||||
if (ib == 0) {
|
||||
return;
|
||||
}
|
||||
const int i00 = ib * nth;
|
||||
const int i01 = i1 % ne01;
|
||||
const int i02 = i2;
|
||||
const int i03 = i3;
|
||||
|
||||
global float * tmp_row = (global float *)(tmp + nbt1 * i01 + nbt2 * i02 + nbt3 * i03);
|
||||
global float * dst_row = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
|
||||
if (i00 + tid < ne00) {
|
||||
dst_row[i00 + tid] += tmp_row[ib - 1];
|
||||
}
|
||||
}
|
||||
|
|
@ -44,7 +44,7 @@ void main() {
|
|||
|
||||
FLOAT_TYPE state[S_V];
|
||||
[[unroll]] for (uint i = 0; i < S_V; i++) {
|
||||
state[i] = FLOAT_TYPE(data_state[state_base + i * S_V + col]);
|
||||
state[i] = FLOAT_TYPE(data_state[state_base + col * S_V + i]);
|
||||
}
|
||||
|
||||
uint attn_off = (seq_id * n_tokens * H + head_id) * S_V;
|
||||
|
|
@ -123,6 +123,6 @@ void main() {
|
|||
}
|
||||
|
||||
[[unroll]] for (uint i = 0; i < S_V; i++) {
|
||||
data_dst[s_off + state_base + i * S_V + col] = state[i];
|
||||
data_dst[s_off + state_base + col * S_V + i] = state[i];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,67 +0,0 @@
|
|||
enable f16;
|
||||
|
||||
struct Params {
|
||||
ne: u32,
|
||||
|
||||
offset_src0: u32,
|
||||
offset_dst: u32,
|
||||
|
||||
stride_src0_0: u32,
|
||||
stride_src0_1: u32,
|
||||
stride_src0_2: u32,
|
||||
stride_src0_3: u32,
|
||||
|
||||
a_ne0: u32,
|
||||
a_ne1: u32,
|
||||
a_ne2: u32,
|
||||
a_ne3: u32,
|
||||
|
||||
ne0: u32,
|
||||
ne1: u32,
|
||||
ne2: u32,
|
||||
};
|
||||
|
||||
#ifdef TYPE_F32
|
||||
#define DataType f32
|
||||
#endif
|
||||
#ifdef TYPE_I32
|
||||
#define DataType i32
|
||||
#endif
|
||||
#ifdef TYPE_I16
|
||||
// same size (16-bit) is sufficient for repeat
|
||||
#define DataType f16
|
||||
#endif
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<DataType>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> dst: array<DataType>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
|
||||
i = i % (params.ne2 * params.ne1 * params.ne0);
|
||||
let i2 = i / (params.ne1 * params.ne0);
|
||||
i = i % (params.ne1 * params.ne0);
|
||||
let i1 = i / params.ne0;
|
||||
let i0 = i % params.ne0;
|
||||
|
||||
let a_i0 = i0 % params.a_ne0;
|
||||
let a_i1 = i1 % params.a_ne1;
|
||||
let a_i2 = i2 % params.a_ne2;
|
||||
let a_i3 = i3 % params.a_ne3;
|
||||
|
||||
let a_index = a_i0 * params.stride_src0_0 +
|
||||
a_i1 * params.stride_src0_1 +
|
||||
a_i2 * params.stride_src0_2 +
|
||||
a_i3 * params.stride_src0_3;
|
||||
|
||||
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + a_index];
|
||||
}
|
||||
}
|
||||
|
|
@ -521,7 +521,12 @@ void llama_context::sched_reserve() {
|
|||
|
||||
if (cparams.fused_gdn_ch) {
|
||||
// more than one token in the batch per sequence in order to take the chunked path
|
||||
auto * gf = graph_reserve(16*n_seqs, n_seqs, n_outputs, mctx.get(), true);
|
||||
// note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
|
||||
// because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
|
||||
// it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
|
||||
// the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
|
||||
const uint32_t n_tokens_ch = 16*n_seqs;
|
||||
auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -225,9 +225,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
|
||||
cb(kg_t, "key_gdiff_t", il);
|
||||
|
||||
ggml_tensor * s_t = ggml_transpose(ctx0, s);
|
||||
s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
|
||||
cb(s_t, "dnet_add_ch_state", il);
|
||||
s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
|
||||
cb(s, "dnet_add_ch_state", il);
|
||||
|
||||
// [CS, S_v, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
|
||||
|
|
@ -240,7 +239,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
|
||||
|
||||
// [CS, S_v, 1, H_v * n_seqs]
|
||||
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
|
||||
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
|
||||
cb(v_t_p, "v_prime", il);
|
||||
|
||||
// [CS, S_v, 1, H_v * n_seqs]
|
||||
|
|
@ -252,7 +251,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
cb(v_attn, "v_attn", il);
|
||||
|
||||
// [S_v, CS, 1, H_v * n_seqs]
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
|
||||
cb(attn_inter, "attn_inter", il);
|
||||
|
||||
// [S_v, CS, 1, H_v * n_seqs]
|
||||
|
|
@ -268,13 +267,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
|
||||
|
||||
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
|
||||
s_t = ggml_add(ctx0, s_t, kgv);
|
||||
cb(s_t, "dnet_add_ch_state", il);
|
||||
s = ggml_mul(ctx0, s, ch_g_last_exp_t);
|
||||
s = ggml_add(ctx0, s, kgv);
|
||||
cb(s, "dnet_add_ch_state", il);
|
||||
}
|
||||
|
||||
s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
// truncate padded tokens
|
||||
ggml_tensor * o = ggml_view_4d(ctx0, v,
|
||||
S_v, n_tokens, H_v, n_seqs,
|
||||
|
|
@ -282,7 +279,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
ggml_row_size(v->type, S_v * CS * n_chunks),
|
||||
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
|
||||
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
|
||||
s = ggml_transpose(ctx0, s_t);
|
||||
s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
|
||||
cb(s, "output_state", il);
|
||||
|
||||
return {o, s};
|
||||
|
|
@ -341,11 +338,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
g = ggml_exp(ctx0, g);
|
||||
s = ggml_mul(ctx0, s, g);
|
||||
|
||||
ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
|
||||
|
||||
// [1, S_v, H_v, n_seqs]
|
||||
ggml_tensor * sk;
|
||||
sk = ggml_mul (ctx0, s_t, k);
|
||||
sk = ggml_mul (ctx0, s, k);
|
||||
sk = ggml_sum_rows(ctx0, sk);
|
||||
|
||||
// [S_v, 1, H_v, n_seqs]
|
||||
|
|
@ -362,15 +357,14 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
k = ggml_repeat(ctx0, k, s);
|
||||
kd = ggml_mul (ctx0, k, d_t);
|
||||
|
||||
s_t = ggml_add(ctx0, s_t, kd);
|
||||
s = ggml_add(ctx0, s, kd);
|
||||
|
||||
cb(s_t, "dnet_add_ar_state", il);
|
||||
cb(s, "dnet_add_ar_state", il);
|
||||
|
||||
ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
|
||||
ggml_tensor * s_q = ggml_mul (ctx0, s, q);
|
||||
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
|
||||
|
||||
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
|
||||
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
|
||||
|
||||
return {o, s};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -579,10 +579,9 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
|
||||
|
||||
//
|
||||
// API used internally with mtmd
|
||||
//
|
||||
|
||||
projector_type clip_get_projector_type(const struct clip_ctx * ctx);
|
||||
void clip_set_debug_output_embeddings(struct clip_ctx * ctx, bool debug);
|
||||
|
|
|
|||
|
|
@ -209,6 +209,8 @@ struct clip_ctx {
|
|||
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||
bool is_allocated = false;
|
||||
|
||||
bool debug_output_embeddings = false;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
flash_attn_type = ctx_params.flash_attn_type;
|
||||
backend_cpu = ggml_backend_cpu_init(); //always has CPU backend
|
||||
|
|
@ -255,6 +257,8 @@ struct clip_ctx {
|
|||
if (ctx_params.cb_eval != nullptr) {
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
|
||||
}
|
||||
|
||||
debug_output_embeddings = std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr;
|
||||
}
|
||||
|
||||
~clip_ctx() {
|
||||
|
|
@ -2277,8 +2281,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
// TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
|
||||
// we can remove this check when we implement audio support for Gemma 3N
|
||||
skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
|
||||
|
||||
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
|
||||
}
|
||||
|
||||
if (loader.has_audio && !skip_audio) {
|
||||
|
|
@ -4179,7 +4181,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
}
|
||||
|
||||
// Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
|
||||
if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
|
||||
if (ctx->debug_output_embeddings) {
|
||||
const int64_t n_embd = embeddings->ne[0];
|
||||
const int64_t n_tokens = embeddings->ne[1];
|
||||
std::vector<float> emb_data(n_embd * n_tokens);
|
||||
|
|
@ -4586,14 +4588,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
|||
//
|
||||
// API for debugging
|
||||
//
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
|
||||
clip_image_f32 img;
|
||||
img.nx = w;
|
||||
img.ny = h;
|
||||
img.buf.resize(h * w * 3);
|
||||
for (int i = 0; i < h * w * 3; i++) {
|
||||
img.buf[i] = static_cast<float>(fill_value);
|
||||
}
|
||||
clip_image_encode(ctx, 1, &img, nullptr);
|
||||
GGML_ASSERT(img.buf.empty() && "expected, always stop here");
|
||||
|
||||
void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) {
|
||||
ctx->debug_output_embeddings = enable;
|
||||
}
|
||||
|
|
|
|||
229
tools/mtmd/debug/mtmd-debug.cpp
Normal file
229
tools/mtmd/debug/mtmd-debug.cpp
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
#include "mtmd-debug.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "debug.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <limits.h>
|
||||
#include <cinttypes>
|
||||
#include <clocale>
|
||||
|
||||
// INTERNAL TOOL FOR DEBUGGING PURPOSES ONLY
|
||||
// NOT INTENDED FOR PUBLIC USE
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG(
|
||||
"Internal debugging tool for mtmd; See mtmd-debug.md for the pytorch equivalent code\n"
|
||||
"Note: we repurpose some args from other examples, they will have different meaning here\n"
|
||||
"\n"
|
||||
"Usage: %s -m <model> --mmproj <mmproj> -p <mode> -n <size> --image <image> --audio <audio>\n"
|
||||
"\n"
|
||||
" -n <size>: number of pixels per edge for image (always square image), or number of samples for audio\n"
|
||||
"\n"
|
||||
" -p \"encode\" (debugging encode pass, default case):\n"
|
||||
" --image can be:\n"
|
||||
" \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n"
|
||||
" \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n"
|
||||
" --audio can be:\n"
|
||||
" \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
|
||||
" \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n"
|
||||
"\n"
|
||||
" -p \"preproc\" (debugging preprocessing pass):\n"
|
||||
" --image can be:\n"
|
||||
" \"white\", \"black\", \"gray\": filled image with respective colors\n"
|
||||
" \"cb\": checkerboard pattern\n"
|
||||
" --audio can be:\n"
|
||||
" \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
|
||||
" \"440\": sine wave with 440 Hz frequency\n"
|
||||
"\n",
|
||||
argv[0]
|
||||
);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
|
||||
if (params.mmproj.path.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
LOG_ERR("ERR: Missing --mmproj argument\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_INF("%s: loading model: %s\n", __func__, params.model.path.c_str());
|
||||
|
||||
mtmd::context_ptr ctx_mtmd;
|
||||
common_init_result_ptr llama_init;
|
||||
base_callback_data cb_data;
|
||||
|
||||
llama_init = common_init_from_params(params);
|
||||
{
|
||||
auto * model = llama_init->model();
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
mparams.use_gpu = params.mmproj_use_gpu;
|
||||
mparams.print_timings = true;
|
||||
mparams.n_threads = params.cpuparams.n_threads;
|
||||
mparams.flash_attn_type = params.flash_attn_type;
|
||||
mparams.warmup = params.warmup;
|
||||
mparams.image_min_tokens = params.image_min_tokens;
|
||||
mparams.image_max_tokens = params.image_max_tokens;
|
||||
{
|
||||
// always enable debug callback
|
||||
mparams.cb_eval_user_data = &cb_data;
|
||||
mparams.cb_eval = common_debug_cb_eval<false>;
|
||||
}
|
||||
ctx_mtmd.reset(mtmd_init_from_file(clip_path, model, mparams));
|
||||
if (!ctx_mtmd.get()) {
|
||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
std::string input;
|
||||
int32_t inp_size = params.n_predict;
|
||||
if (params.image.empty()) {
|
||||
LOG_ERR("ERR: At least one of --image or --audio must be specified\n");
|
||||
return 1;
|
||||
}
|
||||
if (inp_size <= 0) {
|
||||
LOG_ERR("ERR: Invalid size specified with -n, must be greater than 0\n");
|
||||
return 1;
|
||||
}
|
||||
input = params.image[0];
|
||||
|
||||
if (params.prompt.empty() || params.prompt == "encode") {
|
||||
std::vector<std::vector<float>> image;
|
||||
std::vector<float> samples;
|
||||
|
||||
if (input == "black") {
|
||||
for (int i = 0; i < inp_size; ++i) {
|
||||
auto row = std::vector<float>(inp_size * 3, 0.0f);
|
||||
image.push_back(row);
|
||||
}
|
||||
} else if (input == "white") {
|
||||
for (int i = 0; i < inp_size; ++i) {
|
||||
auto row = std::vector<float>(inp_size * 3, 1.0f);
|
||||
image.push_back(row);
|
||||
}
|
||||
} else if (input == "gray") {
|
||||
for (int i = 0; i < inp_size; ++i) {
|
||||
auto row = std::vector<float>(inp_size * 3, 0.5f);
|
||||
image.push_back(row);
|
||||
}
|
||||
} else if (input == "cb") {
|
||||
for (int i = 0; i < inp_size; ++i) {
|
||||
auto row = std::vector<float>(inp_size * 3, 0.0f);
|
||||
image.push_back(row);
|
||||
}
|
||||
for (int y = 0; y < inp_size; ++y) {
|
||||
for (int x = 0; x < inp_size; ++x) {
|
||||
float v = ((x + y) % 2) ? 0.0f : 1.0f;
|
||||
image[y][x * 3 + 0] = v;
|
||||
image[y][x * 3 + 1] = v;
|
||||
image[y][x * 3 + 2] = v;
|
||||
}
|
||||
}
|
||||
} else if (input == "one") {
|
||||
samples = std::vector<float>(inp_size, 1.0f);
|
||||
} else if (input == "zero") {
|
||||
samples = std::vector<float>(inp_size, 0.0f);
|
||||
} else if (input == "half") {
|
||||
samples = std::vector<float>(inp_size, 0.5f);
|
||||
} else if (input == "1010") {
|
||||
samples.resize(inp_size);
|
||||
for (int i = 0; i < inp_size; ++i) {
|
||||
samples[i] = (i % 2) ? 0.0f : 1.0f;
|
||||
}
|
||||
} else {
|
||||
LOG_ERR("ERR: Invalid input specified with --image/--audio\n");
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// run encode pass
|
||||
LOG_INF("Running encode pass for input type: %s\n", input.c_str());
|
||||
if (samples.size() > 0) {
|
||||
LOG_INF("Input audio with %zu samples, type: %s\n", samples.size(), input.c_str());
|
||||
mtmd_debug_encode_audio(ctx_mtmd.get(), samples);
|
||||
} else {
|
||||
LOG_INF("Input image with dimensions %d x %d, type: %s\n", inp_size, inp_size, input.c_str());
|
||||
mtmd_debug_encode_image(ctx_mtmd.get(), image);
|
||||
}
|
||||
|
||||
} else if (params.prompt == "preproc") {
|
||||
std::vector<uint8_t> rgb_values;
|
||||
std::vector<float> pcm_samples;
|
||||
|
||||
if (input == "black") {
|
||||
rgb_values = std::vector<uint8_t>(inp_size * inp_size * 3, 0);
|
||||
} else if (input == "white") {
|
||||
rgb_values = std::vector<uint8_t>(inp_size * inp_size * 3, 255);
|
||||
} else if (input == "gray") {
|
||||
rgb_values = std::vector<uint8_t>(inp_size * inp_size * 3, 128);
|
||||
} else if (input == "cb") {
|
||||
rgb_values.resize(inp_size * inp_size * 3);
|
||||
for (int y = 0; y < inp_size; ++y) {
|
||||
for (int x = 0; x < inp_size; ++x) {
|
||||
uint8_t v = ((x + y) % 2) ? 0 : 255;
|
||||
rgb_values[(y * inp_size + x) * 3 + 0] = v;
|
||||
rgb_values[(y * inp_size + x) * 3 + 1] = v;
|
||||
rgb_values[(y * inp_size + x) * 3 + 2] = v;
|
||||
}
|
||||
}
|
||||
} else if (input == "one") {
|
||||
pcm_samples = std::vector<float>(inp_size, 1.0f);
|
||||
} else if (input == "zero") {
|
||||
pcm_samples = std::vector<float>(inp_size, 0.0f);
|
||||
} else if (input == "half") {
|
||||
pcm_samples = std::vector<float>(inp_size, 0.5f);
|
||||
} else if (input == "440") {
|
||||
pcm_samples.resize(inp_size);
|
||||
float freq = 440.0f;
|
||||
float sample_rate = mtmd_get_audio_sample_rate(ctx_mtmd.get());
|
||||
float pi = 3.14159265f;
|
||||
for (int i = 0; i < inp_size; ++i) {
|
||||
pcm_samples[i] = sinf(2 * pi * freq * i / sample_rate);
|
||||
}
|
||||
} else {
|
||||
LOG_ERR("ERR: Invalid input specified with --image/--audio\n");
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// run preprocessing pass
|
||||
LOG_INF("Running preprocessing pass for input type: %s\n", input.c_str());
|
||||
if (pcm_samples.size() > 0) {
|
||||
LOG_INF("Input audio with %zu samples, type: %s\n", pcm_samples.size(), input.c_str());
|
||||
mtmd_debug_preprocess_audio(ctx_mtmd.get(), pcm_samples);
|
||||
} else {
|
||||
LOG_INF("Input image with dimensions %d x %d, type: %s\n", inp_size, inp_size, input.c_str());
|
||||
mtmd_debug_preprocess_image(ctx_mtmd.get(), rgb_values, inp_size, inp_size);
|
||||
}
|
||||
|
||||
} else {
|
||||
LOG_ERR("ERR: Invalid mode specified with -p\n");
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
17
tools/mtmd/debug/mtmd-debug.h
Normal file
17
tools/mtmd/debug/mtmd-debug.h
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include "mtmd.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
// INTERNAL HEADER FOR DEBUGGING PURPOSES ONLY
|
||||
// NOT INTENDED FOR PUBLIC USE
|
||||
// Do not raise issues related to this debugging API
|
||||
|
||||
// encode take the pre-processed f32 values, print the intermidiate values via cb_eval callback
|
||||
MTMD_API void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<float>> & image);
|
||||
MTMD_API void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & input); // will be broadcasted to fit n_mel
|
||||
|
||||
// preprocess take the raw input values
|
||||
MTMD_API void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t> & rgb_values, int nx, int ny);
|
||||
MTMD_API void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> & pcm_samples);
|
||||
25
tools/mtmd/debug/mtmd-debug.md
Normal file
25
tools/mtmd/debug/mtmd-debug.md
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# mtmd-debug
|
||||
|
||||
## Debugging encode pass
|
||||
|
||||
Example of debugging an input gray image (raw, not preprocessed):
|
||||
|
||||
```py
|
||||
from transformers import AutoModel
|
||||
|
||||
model = AutoModel.from_pretrained(...)
|
||||
|
||||
def test_vision():
|
||||
img_size = 896 # number of patches per side
|
||||
pixel_values = torch.zeros(1, 3, img_size, img_size) + 0.5 # gray image
|
||||
with torch.no_grad():
|
||||
outputs = model.model.get_image_features(pixel_values=pixel_values)
|
||||
print("last_hidden_state shape:", outputs.last_hidden_state.shape)
|
||||
print("last_hidden_state:", outputs.last_hidden_state)
|
||||
|
||||
test_vision()
|
||||
```
|
||||
|
||||
## Debugging preprocess pass
|
||||
|
||||
(TODO)
|
||||
|
|
@ -470,12 +470,12 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
|
|||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
|
||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
||||
std::vector<float> pcmf32;
|
||||
int bitrate = mtmd_get_audio_bitrate(ctx);
|
||||
if (bitrate < 0) {
|
||||
const int sample_rate = mtmd_get_audio_sample_rate(ctx);
|
||||
if (sample_rate < 0) {
|
||||
LOG_ERR("This model does not support audio input\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
|
||||
if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, pcmf32)) {
|
||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#include "clip-impl.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-audio.h"
|
||||
#include "debug/mtmd-debug.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
|
|
@ -912,7 +913,7 @@ bool mtmd_support_audio(mtmd_context * ctx) {
|
|||
return ctx->ctx_a != nullptr;
|
||||
}
|
||||
|
||||
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
|
||||
int mtmd_get_audio_sample_rate(mtmd_context * ctx) {
|
||||
if (!ctx->ctx_a) {
|
||||
return -1;
|
||||
}
|
||||
|
|
@ -1157,3 +1158,104 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
|
|||
g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
|
||||
g_logger_state.log_callback_user_data = user_data;
|
||||
}
|
||||
|
||||
//
|
||||
// Debugging API (NOT intended for public use)
|
||||
//
|
||||
|
||||
static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip_image_f32 & image) {
|
||||
clip_set_debug_output_embeddings(ctx_clip, true);
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
|
||||
int n_tokens = clip_n_output_tokens(ctx_clip, &image);
|
||||
std::vector<float> embd_output(n_tokens * n_mmproj_embd, 0.0f);
|
||||
bool ok = clip_image_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image,
|
||||
embd_output.data());
|
||||
if (!ok) {
|
||||
LOG_ERR("%s: failed to encode image\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<float>> & image) {
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return;
|
||||
}
|
||||
clip_image_f32 inp_image;
|
||||
inp_image.nx = image.size();
|
||||
inp_image.ny = inp_image.nx;
|
||||
inp_image.buf.reserve(inp_image.nx * inp_image.ny);
|
||||
for (const auto & row : image) {
|
||||
inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end());
|
||||
}
|
||||
LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny);
|
||||
mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image);
|
||||
}
|
||||
|
||||
void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & input) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return;
|
||||
}
|
||||
int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins;
|
||||
clip_image_f32 inp_audio;
|
||||
inp_audio.nx = input.size();
|
||||
inp_audio.ny = n_mel;
|
||||
inp_audio.buf.resize(input.size() * n_mel);
|
||||
for (size_t i = 0; i < input.size(); i++) {
|
||||
for (int j = 0; j < n_mel; j++) {
|
||||
inp_audio.buf[j * inp_audio.nx + i] = input[i];
|
||||
}
|
||||
}
|
||||
LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny);
|
||||
mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio);
|
||||
}
|
||||
|
||||
void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t> & rgb_values, int nx, int ny) {
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return;
|
||||
}
|
||||
clip_image_u8 img_u8;
|
||||
img_u8.nx = nx;
|
||||
img_u8.ny = ny;
|
||||
img_u8.buf = rgb_values;
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("%s: failed to preprocess image\n", __func__);
|
||||
return;
|
||||
}
|
||||
LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
|
||||
for (size_t i = 0; i < batch_f32.entries.size(); i++) {
|
||||
LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny);
|
||||
// TODO: better way to dump entry content?
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> & samples) {
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return;
|
||||
}
|
||||
std::vector<mtmd_audio_mel> mel_spec_chunks;
|
||||
bool ok = ctx->audio_preproc->preprocess(samples.data(), samples.size(), mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("%s: failed to preprocess audio\n", __func__);
|
||||
return;
|
||||
}
|
||||
LOG_INF("%s: preprocessed audio to %zu mel spec chunks\n", __func__, mel_spec_chunks.size());
|
||||
for (size_t i = 0; i < mel_spec_chunks.size(); i++) {
|
||||
LOG_INF("%s: mel spec chunk %zu has n_len=%d, n_mel=%d\n", __func__, i, mel_spec_chunks[i].n_len, mel_spec_chunks[i].n_mel);
|
||||
|
||||
// dump mel entries: data is stored as [n_mel][n_len] (mel-major)
|
||||
const auto & mel = mel_spec_chunks[i];
|
||||
for (int m = 0; m < mel.n_mel; m++) {
|
||||
for (int t = 0; t < mel.n_len; t++) {
|
||||
LOG_INF("mel[%zu][m=%d][t=%d] = %f\n", i, m, t, mel.data[m * mel.n_len + t]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -125,9 +125,9 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
|||
// whether the current model supports audio input
|
||||
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||
|
||||
// get audio bitrate in Hz, for example 16000 for Whisper
|
||||
// get audio sample rate in Hz, for example 16000 for Whisper
|
||||
// return -1 if audio is not supported
|
||||
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
|
||||
MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
|
||||
|
||||
// mtmd_bitmap
|
||||
//
|
||||
|
|
|
|||
|
|
@ -1189,6 +1189,9 @@ private:
|
|||
? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
|
||||
: SLOT_STATE_STARTED;
|
||||
|
||||
// reset server kill-switch counter
|
||||
n_empty_consecutive = 0;
|
||||
|
||||
SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -101,6 +101,40 @@ def test_embedding_mixed_input(input, is_multi_prompt: bool):
|
|||
assert len(data[0]['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_pooling_mean():
|
||||
global server
|
||||
server.pooling = 'mean'
|
||||
server.start()
|
||||
res = server.make_request("POST", "/v1/embeddings", data={
|
||||
"input": "I believe the meaning of life is",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 1
|
||||
assert 'embedding' in res.body['data'][0]
|
||||
assert len(res.body['data'][0]['embedding']) > 1
|
||||
|
||||
# make sure embedding vector is normalized
|
||||
assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
|
||||
|
||||
|
||||
def test_embedding_pooling_mean_multiple():
|
||||
global server
|
||||
server.pooling = 'mean'
|
||||
server.start()
|
||||
res = server.make_request("POST", "/v1/embeddings", data={
|
||||
"input": [
|
||||
"I believe the meaning of life is",
|
||||
"Write a joke about AI",
|
||||
"This is a test",
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 3
|
||||
for d in res.body['data']:
|
||||
assert 'embedding' in d
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_pooling_none():
|
||||
global server
|
||||
server.pooling = 'none'
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@
|
|||
<iframe
|
||||
bind:this={iframeRef}
|
||||
title="Preview {language}"
|
||||
sandbox="allow-scripts allow-same-origin"
|
||||
sandbox="allow-scripts"
|
||||
class="code-preview-iframe"
|
||||
></iframe>
|
||||
|
||||
|
|
|
|||
44
vendor/cpp-httplib/httplib.cpp
vendored
44
vendor/cpp-httplib/httplib.cpp
vendored
|
|
@ -1995,9 +1995,9 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
|
|||
memcpy((*current)->ai_addr, sockaddr_ptr, sockaddr_len);
|
||||
|
||||
// Set port if service is specified
|
||||
if (service && strlen(service) > 0) {
|
||||
int port = atoi(service);
|
||||
if (port > 0) {
|
||||
if (service && *service) {
|
||||
int port = 0;
|
||||
if (parse_port(service, strlen(service), port)) {
|
||||
if (sockaddr_ptr->sa_family == AF_INET) {
|
||||
reinterpret_cast<struct sockaddr_in *>((*current)->ai_addr)
|
||||
->sin_port = htons(static_cast<uint16_t>(port));
|
||||
|
|
@ -3016,6 +3016,16 @@ bool read_headers(Stream &strm, Headers &headers) {
|
|||
header_count++;
|
||||
}
|
||||
|
||||
// RFC 9110 Section 8.6: Reject requests with multiple Content-Length
|
||||
// headers that have different values to prevent request smuggling.
|
||||
auto cl_range = headers.equal_range("Content-Length");
|
||||
if (cl_range.first != cl_range.second) {
|
||||
const auto &first_val = cl_range.first->second;
|
||||
for (auto it = std::next(cl_range.first); it != cl_range.second; ++it) {
|
||||
if (it->second != first_val) { return false; }
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -7522,6 +7532,10 @@ bool Server::listen_internal() {
|
|||
detail::set_socket_opt_time(sock, SOL_SOCKET, SO_SNDTIMEO,
|
||||
write_timeout_sec_, write_timeout_usec_);
|
||||
|
||||
if (tcp_nodelay_) {
|
||||
detail::set_socket_opt(sock, IPPROTO_TCP, TCP_NODELAY, 1);
|
||||
}
|
||||
|
||||
if (!task_queue->enqueue(
|
||||
[this, sock]() { process_and_close_socket(sock); })) {
|
||||
output_error_log(Error::ResourceExhaustion, nullptr);
|
||||
|
|
@ -8911,7 +8925,7 @@ bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
|
|||
|
||||
auto next_port = port_;
|
||||
if (!port_str.empty()) {
|
||||
next_port = std::stoi(port_str);
|
||||
if (!detail::parse_port(port_str, next_port)) { return false; }
|
||||
} else if (!next_scheme.empty()) {
|
||||
next_port = next_scheme == "https" ? 443 : 80;
|
||||
}
|
||||
|
|
@ -8962,18 +8976,10 @@ bool ClientImpl::create_redirect_client(
|
|||
// Setup basic client configuration first
|
||||
setup_redirect_client(redirect_client);
|
||||
|
||||
// SSL-specific configuration for proxy environments
|
||||
if (!proxy_host_.empty() && proxy_port_ != -1) {
|
||||
// Critical: Disable SSL verification for proxy environments
|
||||
redirect_client.enable_server_certificate_verification(false);
|
||||
redirect_client.enable_server_hostname_verification(false);
|
||||
} else {
|
||||
// For direct SSL connections, copy SSL verification settings
|
||||
redirect_client.enable_server_certificate_verification(
|
||||
server_certificate_verification_);
|
||||
redirect_client.enable_server_hostname_verification(
|
||||
server_hostname_verification_);
|
||||
}
|
||||
redirect_client.enable_server_certificate_verification(
|
||||
server_certificate_verification_);
|
||||
redirect_client.enable_server_hostname_verification(
|
||||
server_hostname_verification_);
|
||||
|
||||
// Transfer CA certificate to redirect client
|
||||
if (!ca_cert_pem_.empty()) {
|
||||
|
|
@ -10690,7 +10696,8 @@ Client::Client(const std::string &scheme_host_port,
|
|||
if (host.empty()) { host = m[3].str(); }
|
||||
|
||||
auto port_str = m[4].str();
|
||||
auto port = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80);
|
||||
auto port = is_ssl ? 443 : 80;
|
||||
if (!port_str.empty() && !detail::parse_port(port_str, port)) { return; }
|
||||
|
||||
if (is_ssl) {
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
|
|
@ -16103,7 +16110,8 @@ WebSocketClient::WebSocketClient(
|
|||
if (host_.empty()) { host_ = m[3].str(); }
|
||||
|
||||
auto port_str = m[4].str();
|
||||
port_ = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80);
|
||||
port_ = is_ssl ? 443 : 80;
|
||||
if (!port_str.empty() && !detail::parse_port(port_str, port_)) { return; }
|
||||
|
||||
path_ = m[5].str();
|
||||
|
||||
|
|
|
|||
16
vendor/cpp-httplib/httplib.h
vendored
16
vendor/cpp-httplib/httplib.h
vendored
|
|
@ -8,8 +8,8 @@
|
|||
#ifndef CPPHTTPLIB_HTTPLIB_H
|
||||
#define CPPHTTPLIB_HTTPLIB_H
|
||||
|
||||
#define CPPHTTPLIB_VERSION "0.37.1"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x002501"
|
||||
#define CPPHTTPLIB_VERSION "0.37.2"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x002502"
|
||||
|
||||
#ifdef _WIN32
|
||||
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
|
||||
|
|
@ -689,6 +689,18 @@ inline from_chars_result<double> from_chars(const char *first, const char *last,
|
|||
return {first + (endptr - s.c_str()), std::errc{}};
|
||||
}
|
||||
|
||||
inline bool parse_port(const char *s, size_t len, int &port) {
|
||||
int val = 0;
|
||||
auto r = from_chars(s, s + len, val);
|
||||
if (r.ec != std::errc{} || val < 1 || val > 65535) { return false; }
|
||||
port = val;
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool parse_port(const std::string &s, int &port) {
|
||||
return parse_port(s.data(), s.size(), port);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
enum SSLVerifierResponse {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue