Mirror of https://github.com/LostRuins/koboldcpp.git
Synced 2025-09-11 17:44:38 +00:00

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	docs/backend/CANN.md
#	docs/multimodal/minicpmo2.6.md
#	docs/multimodal/minicpmv2.5.md
#	docs/multimodal/minicpmv2.6.md
#	examples/speculative-simple/speculative-simple.cpp
#	ggml/cmake/ggml-config.cmake.in
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cpu/repack.cpp
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/add.cl
#	ggml/src/ggml-opencl/kernels/mul.cl
#	scripts/compare-commits.sh
#	scripts/compare-llama-bench.py
#	scripts/sync-ggml.last
#	tools/server/README.md
Commit: f430916a71
57 changed files with 6028 additions and 731 deletions
.devops/cann.Dockerfile (new file, 130 lines)

# ==============================================================================
# ARGUMENTS
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...

WORKDIR /app

# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

COPY --from=build /app/full /app

# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum

# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
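As a usage sketch (not part of the diff): assuming the file above is committed as `.devops/cann.Dockerfile` at the repository root, the three targets it defines can be built roughly as follows. The image tag names are illustrative assumptions, and running the resulting containers additionally requires access to the Ascend devices and drivers on the host, which is not shown here.

```bash
# build the three targets defined by the Dockerfile (run from the repository root)
docker build -f .devops/cann.Dockerfile --target full   -t llama-cpp-cann:full   .
docker build -f .devops/cann.Dockerfile --target light  -t llama-cpp-cann:light  .
docker build -f .devops/cann.Dockerfile --target server -t llama-cpp-cann:server .

# override the default SOC type (Ascend910B3) at build time
docker build -f .devops/cann.Dockerfile --target server \
    --build-arg ASCEND_SOC_TYPE=<your SOC type> -t llama-cpp-cann:server .
```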
@@ -979,6 +979,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -2093,6 +2097,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2371,6 +2382,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$",   ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -3251,6 +3271,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3440,28 +3467,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-eps" }, "F",
-        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
-        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
-                      params.diffusion.algorithm),
-        [](common_params & params, int value) { params.diffusion.algorithm = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
-    add_opt(common_arg(
-        { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
-        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-visual" },
         string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
@@ -3469,5 +3479,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) { params.diffusion.visual_mode = true; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
+                      params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+    add_opt(common_arg(
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 
     return ctx_arg;
 }
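A hedged usage sketch for the new command-line options introduced in the hunks above (presumably llama.cpp's common argument parser). The flag names and environment variables come from the diff itself; the model paths, layer counts, and replacement strings below are placeholders, not values taken from the source:

```bash
# keep MoE expert weights on the CPU and disable weight repacking
llama-server -m model.gguf -ngl 99 --cpu-moe --no-repack

# the same options via their environment variables
LLAMA_ARG_CPU_MOE=1 LLAMA_ARG_NO_REPACK=1 llama-server -m model.gguf -ngl 99

# speculative decoding with a draft model whose vocab/template differs from the target:
# map a target-side string to its draft-side equivalent before retokenization
llama-server -m target.gguf -md draft.gguf --spec-replace "<target-token>" "<draft-token>"
```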
@@ -1130,6 +1130,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
@@ -197,6 +197,7 @@ struct common_params_speculative {
     int32_t n_gpu_layers = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     float   p_split      = 0.1f; // speculative decoding split probability
     float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -216,11 +217,17 @@ struct common_params_vocoder {
 };
 
 struct common_params_diffusion {
-    int32_t steps     = 64;    // number of diffusion steps
-    float   eps       = 1e-3f; // epsilon for timesteps
-    int32_t algorithm = 0;     // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
+    int32_t steps        = 128;
+    bool    visual_mode  = false;
+
+    float   eps          = 0;    // epsilon for timesteps
+    int32_t block_length = 0;    // block length for generation
+
+    int32_t algorithm    = 4;    // default algorithm: low-confidence
     float   alg_temp     = 0.0f; // algorithm temperature
-    bool    visual_mode = false; // show progressive diffusion on screen
+
+    float   cfg_scale        = 0;     // classifier-free guidance scale
+    bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
 enum common_reasoning_format {
@@ -348,6 +355,7 @@ struct common_params {
     bool warmup        = true;  // warmup run
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
 
@@ -1,30 +1,39 @@
 #include "speculative.h"
 
+#include "ggml.h"
+#include "llama.h"
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
 
 #include <cstring>
 #include <algorithm>
+#include <map>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
 
 struct common_speculative {
-    struct llama_context * ctx;
+    struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
+    struct llama_context * ctx_dft;
     struct common_sampler * smpl;
 
     llama_batch batch;
-    llama_tokens prompt;
+    llama_tokens prompt_dft;
+    bool vocab_dft_compatible = true; // whether retokenization is needed
+    std::map<std::string, std::string> tgt_dft_replacements = {};
 };
 
 struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_tgt,
         struct llama_context * ctx_dft) {
     auto * result = new common_speculative {
-        /* .ctx                  = */ ctx_dft,
+        /* .ctx_tgt              = */ ctx_tgt,
+        /* .ctx_dft              = */ ctx_dft,
         /* .smpl                 = */ nullptr,
         /* .batch                = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
-        /* .prompt               = */ {},
+        /* .prompt_dft           = */ {},
+        /* .vocab_dft_compatible = */ false,
     };
 
     // TODO: optimize or pass from outside?
@@ -59,6 +68,9 @@ struct common_speculative * common_speculative_init(
     }
 #endif
 
+    result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
+    LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);
+
     return result;
 }
 
@@ -90,31 +102,32 @@ bool common_speculative_are_compatible(
     LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
-        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
-                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
+        LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
         return false;
     }
 
-    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+    if (
+        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
         llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
         llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
-        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
-        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
-        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
-        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
+    ) {
+        LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
         return false;
     }
 
     {
         const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
         const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
-        const int vocab_diff  = std::abs(n_vocab_tgt - n_vocab_dft);
+
+        const int vocab_diff = n_vocab_tgt > n_vocab_dft
+            ? n_vocab_tgt - n_vocab_dft
+            : n_vocab_dft - n_vocab_tgt;
 
         if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
-                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
-                    __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+            LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
             return false;
         }
 
@@ -122,8 +135,8 @@ bool common_speculative_are_compatible(
             const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
             const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
-                        "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
+                LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
                         common_token_to_piece(ctx_tgt, i).c_str(),
                         common_token_to_piece(ctx_dft, i).c_str());
                 return false;
@@ -134,32 +147,93 @@ bool common_speculative_are_compatible(
     return true;
 }
 
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec,
+        const char *source, const char *dest) {
+    spec->tgt_dft_replacements[source] = dest;
+}
+
+static std::string replace_to_dft(
+        struct common_speculative * spec,
+        const std::string& input) {
+    std::string result = input;
+    for (const auto & pair : spec->tgt_dft_replacements) {
+        size_t pos = result.find(pair.first);
+        while (pos != std::string::npos) {
+            result.replace(pos, pair.first.length(), pair.second);
+            pos = result.find(pair.first, pos + pair.second.length());
+        }
+    }
+    return result;
+}
+
+static std::string replace_to_tgt(
+        struct common_speculative * spec,
+        const std::string& input) {
+    std::string result = input;
+    for (const auto& pair : spec->tgt_dft_replacements) {
+        size_t pos = result.find(pair.second);
+        while (pos != std::string::npos) {
+            result.replace(pos, pair.second.length(), pair.first);
+            pos = result.find(pair.second, pos + pair.first.length());
+        }
+    }
+    return result;
+}
+
+
 llama_tokens common_speculative_gen_draft(
         struct common_speculative * spec,
         struct common_speculative_params params,
-        const llama_tokens & prompt_tgt,
+        const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
         llama_token id_last) {
     auto & batch = spec->batch;
-    auto & ctx = spec->ctx;
+    auto & ctx_tgt = spec->ctx_tgt;
+    auto & ctx_dft = spec->ctx_dft;
     auto & smpl = spec->smpl;
-    auto & prompt = spec->prompt;
+    auto & prompt_dft = spec->prompt_dft;
 
-    auto * mem = llama_get_memory(ctx);
+    auto * mem_dft = llama_get_memory(ctx_dft);
 
     int reuse_i = 0;
     int reuse_n = 0;
 
-    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+    const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;
 
+    llama_tokens prompt_tgt_draft_model;
+    if (!spec->vocab_dft_compatible) {
+        std::string text;
+        text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
+        text = replace_to_dft(spec, text);
+        LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
+        prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);
+
+        // convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
+        const auto * model_tgt = llama_get_model(ctx_tgt);
+        const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
+
+        int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
+        GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
+        text.resize(-n_chars);
+        llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
+        text = replace_to_dft(spec, text);
+
+        LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
+        id_last = common_tokenize(ctx_dft, text, false, true)[0];
+    }
+    // prompt_tgt's tokens will always be compatible with ctx_dft
+    const llama_tokens &prompt_tgt =
+        spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
+
     const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
 
     // reuse as much as possible from the old draft context
     // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
-    for (int i = 0; i < (int) prompt.size(); ++i) {
+    for (int i = 0; i < (int) prompt_dft.size(); ++i) {
         int cur = 0;
         while (i_start + cur < (int) prompt_tgt.size() &&
-               i + cur < (int) prompt.size() &&
-               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+               i + cur < (int) prompt_dft.size() &&
+               prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
             cur++;
         }
 
@@ -169,21 +243,20 @@ llama_tokens common_speculative_gen_draft(
         }
     }
 
-    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
 
     llama_tokens result;
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_memory_clear(mem, false);
-
-        prompt.clear();
+        llama_memory_clear(mem_dft, false);
+        prompt_dft.clear();
     } else {
         // this happens when a previous draft has been discarded (for example, due to being too small), but the
         // target model agreed with it. in this case, we simply pass back the previous results to save compute
-        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
-            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
-                result.push_back(prompt[i]);
+        if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
+                result.push_back(prompt_dft[i]);
 
                 if (params.n_draft <= (int) result.size()) {
                     break;
@@ -194,16 +267,15 @@ llama_tokens common_speculative_gen_draft(
             }
 
         if (reuse_i > 0) {
-            llama_memory_seq_rm (mem, 0, 0, reuse_i);
-            llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
+            llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
+            llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
 
-            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+            prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
         }
 
-        if (reuse_n < (int) prompt.size()) {
-            llama_memory_seq_rm (mem, 0, reuse_n, -1);
-
-            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        if (reuse_n < (int) prompt_dft.size()) {
+            llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
+            prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
         }
     }
 
@@ -214,28 +286,28 @@ llama_tokens common_speculative_gen_draft(
         //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
         common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
 
-        prompt.push_back(prompt_tgt[i]);
+        prompt_dft.push_back(prompt_tgt[i]);
     }
 
     // we should rarely end-up here during normal decoding
     if (batch.n_tokens > 0) {
         //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
 
-        llama_decode(ctx, batch);
+        llama_decode(ctx_dft, batch);
     }
 
-    const llama_pos n_past = prompt.size();
+    const llama_pos n_past = prompt_dft.size();
 
     LOG_DBG("%s: n_past = %d\n", __func__, n_past);
 
     common_batch_clear(batch);
     common_batch_add (batch, id_last, n_past, { 0 }, true);
 
-    prompt.push_back(id_last);
+    prompt_dft.push_back(id_last);
 
-    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+    LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
 
-    llama_decode(ctx, batch);
+    llama_decode(ctx_dft, batch);
 
     common_sampler_reset(smpl);
 
@@ -243,13 +315,13 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl, ctx, 0, true);
+        common_sampler_sample(smpl, ctx_dft, 0, true);
 
         const auto * cur_p = common_sampler_get_candidates(smpl);
 
        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                   k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+                   k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
        }
 
         // add drafted token for each sequence
@@ -271,10 +343,19 @@ llama_tokens common_speculative_gen_draft(
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
-        llama_decode(ctx, batch);
+        llama_decode(ctx_dft, batch);
 
-        prompt.push_back(id);
+        prompt_dft.push_back(id);
     }
 
+    if (!spec->vocab_dft_compatible) {
+        std::string detokenized = common_detokenize(ctx_dft, result, true);
+        detokenized = replace_to_tgt(spec, detokenized);
+        LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
+        result = common_tokenize(ctx_tgt, detokenized, false, true);
+        if (result.size() > (size_t)params.n_draft) {
+            result.resize(params.n_draft);
+        }
+    }
     return result;
 }
@@ -12,7 +12,10 @@ struct common_speculative_params {
     float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
-struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_tgt,
+        struct llama_context * ctx_dft
+);
 
 void common_speculative_free(struct common_speculative * spec);
 
@@ -20,6 +23,10 @@ bool common_speculative_are_compatible(
         const struct llama_context * ctx_tgt,
         const struct llama_context * ctx_dft);
 
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec,
+        const char *source, const char *dest);
+
 // sample up to n_draft tokens and add them to the batch using the draft model
 llama_tokens common_speculative_gen_draft(
         struct common_speculative * spec,
@@ -684,6 +684,9 @@ class TextModel(ModelBase):
         if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
             # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
             res = "hunyuan"
+        if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
+            # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct
+            res = "hunyuan-dense"
         if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
             # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
             res = "falcon-h1"
@@ -2904,6 +2907,107 @@ class DreamModel(TextModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("LLaDAModelLM")
+class LLaDAModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLADA
+    undo_permute = True
+
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        vocab_dict = tokenizer.get_vocab()
+        vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
+        assert max(vocab_dict.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                # Check if it's a special token - treat special tokens as CONTROL tokens
+                if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
+                    if tokenizer.added_tokens_decoder[i].special:
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
+                    toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        return tokens, toktypes, tokpre
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+        # LLaDA specific parameters
+        self.gguf_writer.add_add_bos_token(True)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+
+        # Add parameters similar to LlamaModel
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if (rope_dim := hparams.get("head_dim")) is None:
+            n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
+            rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        # Set context length for LLaDA
+        context_length = self.hparams.get("max_sequence_length", 4096)
+        self.gguf_writer.add_context_length(context_length)
+
+        # Set embedding length (dimension size)
+        embedding_length = self.hparams.get("d_model", 4096)
+        self.gguf_writer.add_embedding_length(embedding_length)
+
+        # Set feed forward length (MLP hidden size)
+        feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
+        self.gguf_writer.add_feed_forward_length(feed_forward_length)
+
+        # LLaDA models use non-causal attention for diffusion, similar to Dream
+        self.gguf_writer.add_causal_attention(False)
+
+        # LLaDA models don't shift their logits
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
+        n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
+
+        if self.undo_permute:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LLaDAModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head)
+
+        # LLaDA model tensors should be mapped directly since it's the base model
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Ernie4_5_ForCausalLM")
 class Ernie4_5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.ERNIE4_5
@@ -7452,11 +7556,6 @@ class FalconH1Model(Mamba2Model):
 class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # For handling tied embeddings
-        self._tok_embd = None
-
     def set_vocab(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@@ -7550,9 +7649,6 @@ class HunYuanMoEModel(TextModel):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name == "model.embed_tokens.weight":
-            self._tok_embd = data_torch.clone()
-
         if name == "lm_head.weight":
             if self.hparams.get("tie_word_embeddings", False):
                 logger.info("Skipping tied output layer 'lm_head.weight'")
@@ -7597,6 +7693,98 @@ class HunYuanMoEModel(TextModel):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
+class HunYuanModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+            # 1. Get the pre-tokenizer identifier hash
+            tokpre = self.get_vocab_base_pre(tokenizer)
+
+            # 2. Reverse-engineer the merges list from mergeable_ranks
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # 3. Generate the tokens and toktypes lists
+            vocab_size = self.hparams["vocab_size"]
+            assert tokenizer.vocab_size == vocab_size
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            # 4. Write all vocab-related fields to the GGUF writer
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            # 5. Add special tokens and chat templates
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        if self.hparams['hidden_size'] == 4096:
+            self.gguf_writer.add_bos_token_id(127958)  # only for 7b dense, fix <|bos|> token
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 50)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = hparams["head_dim"]
+            scaled_base = base * (alpha ** (dim / (dim - 2)))
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
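As an illustrative sketch (the checkpoint paths and output names are placeholders, not taken from the diff), the updated conversion script is used the same way as before; the new model classes and pre-tokenizer hash are picked up automatically from the checkpoint's architecture and tokenizer:

```bash
# convert a LLaDA diffusion checkpoint to GGUF with the updated script
python convert_hf_to_gguf.py /path/to/LLaDA-8B-Base --outfile llada-8b.gguf --outtype f16

# convert a HunYuanDenseV1ForCausalLM checkpoint (e.g. tencent/Hunyuan-4B-Instruct);
# the "hunyuan-dense" pre-tokenizer hash added above is matched automatically
python convert_hf_to_gguf.py /path/to/Hunyuan-4B-Instruct --outfile hunyuan-4b.gguf --outtype f16
```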
@@ -140,6 +140,7 @@ pre_computed_hashes = [
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
     # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
docs/multimodal/minicpmo4.0.md (new file, 47 lines)

## MiniCPM-o 4

### Prepare models and code

Download the [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from Hugging Face into a "MiniCPM-o-4" folder.

### Build llama.cpp
Readme modification time: 20250206

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```

### Usage of MiniCPM-o 4

Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf)):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Inference on Linux or Mac:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
```
docs/multimodal/minicpmv4.0.md (new file, 47 lines)

## MiniCPM-V 4

### Prepare models and code

Download the [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from Hugging Face into a "MiniCPM-V-4" folder.

### Build llama.cpp
Readme modification time: 20250206

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```

### Usage of MiniCPM-V 4

Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf)):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Inference on Linux or Mac:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
```
13
examples/diffusion/README.md
Normal file
@ -0,0 +1,13 @@
# Diffusion Text Generation

This directory contains implementations for Diffusion LLMs (DLLMs).

More info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771

Example of using the Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`

Example of using the LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
@ -5,12 +5,22 @@
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <cstring>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <random>
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
|
||||||
|
|
||||||
|
// Unified transfer scheduling methods
|
||||||
|
enum transfer_schedule {
|
||||||
|
TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
|
||||||
|
BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
|
||||||
|
};
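For a concrete sense of the Dream-style schedule referenced above, a minimal sketch with assumed values (`steps = 64`, `eps = 1e-3`; not code from this patch): the first step reveals roughly 1/64 of the still-masked positions, and the fraction grows as `t` shrinks toward `eps`.

```cpp
// Minimal sketch (assumed values, not from the patch): the fraction (1.0f - s/t) of
// still-masked tokens that the timestep-based schedule reveals at a given step.
#include <cstdio>

int main() {
    const int   steps = 64;     // assumed
    const float eps   = 1e-3f;  // assumed
    for (int step = 0; step < steps; step += 16) {
        float t = 1.0f - (float) step / steps * (1.0f - eps);
        float s = 1.0f - (float) (step + 1) / steps * (1.0f - eps);
        printf("step %2d: reveal %.3f of the remaining masks\n", step, 1.0f - s / t);
    }
    return 0;
}
```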
|
||||||
|
|
||||||
typedef bool (*diffusion_step_callback_t)(int32_t step,
|
typedef bool (*diffusion_step_callback_t)(int32_t step,
|
||||||
int32_t total_steps,
|
int32_t total_steps,
|
||||||
|
@ -18,325 +28,99 @@ typedef bool (*diffusion_step_callback_t)(int32_t step,
|
||||||
int32_t n_tokens,
|
int32_t n_tokens,
|
||||||
void * user_data);
|
void * user_data);
|
||||||
|
|
||||||
enum diffusion_alg {
|
|
||||||
DIFFUSION_ALG_ORIGIN = 0,
|
|
||||||
DIFFUSION_ALG_MASKGIT_PLUS = 1,
|
|
||||||
DIFFUSION_ALG_TOPK_MARGIN = 2,
|
|
||||||
DIFFUSION_ALG_ENTROPY = 3,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct diffusion_params {
|
struct diffusion_params {
|
||||||
int32_t steps;
|
int32_t steps = 0;
|
||||||
float eps;
|
float temperature = 0;
|
||||||
float temperature;
|
llama_token mask_token_id = LLAMA_TOKEN_NULL;
|
||||||
float top_p;
|
diffusion_step_callback_t step_callback = nullptr;
|
||||||
int32_t top_k;
|
void * step_callback_user_data = nullptr;
|
||||||
llama_token mask_token_id;
|
int32_t seed = 0;
|
||||||
enum diffusion_alg algorithm;
|
bool visual_mode = false;
|
||||||
float alg_temp;
|
bool shift_logits = false; // Shift logits by -1 after decode
|
||||||
diffusion_step_callback_t step_callback;
|
|
||||||
void * step_callback_user_data;
|
float top_p = 0.;
|
||||||
int32_t seed;
|
int32_t top_k = 0;
|
||||||
|
|
||||||
|
diffusion_algorithm algorithm = CONFIDENCE_BASED;
|
||||||
|
transfer_schedule schedule = TIMESTEP_BASED;
|
||||||
|
|
||||||
|
float cfg_scale = 0.; // Config scale for classifier-free guidance
|
||||||
|
float eps = 0.; // Timestep scheduling
|
||||||
|
int32_t block_length = 0; // Block size (for block scheduling)
|
||||||
|
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
|
||||||
|
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
|
||||||
|
|
||||||
|
int32_t max_length = 0; // Maximum sequence length
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static diffusion_params diffusion_default_params() {
|
|
||||||
diffusion_params params = {};
|
|
||||||
params.steps = 64;
|
|
||||||
params.eps = 1e-3f;
|
|
||||||
params.temperature = 0.2f;
|
|
||||||
params.top_p = 0.95f;
|
|
||||||
params.top_k = 0;
|
|
||||||
params.mask_token_id = LLAMA_TOKEN_NULL;
|
|
||||||
params.algorithm = DIFFUSION_ALG_ORIGIN;
|
|
||||||
params.alg_temp = 0.0f;
|
|
||||||
params.step_callback = nullptr;
|
|
||||||
params.step_callback_user_data = nullptr;
|
|
||||||
params.seed = 0;
|
|
||||||
return params;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void diffusion_generate(llama_context * ctx,
|
|
||||||
const llama_token * input_tokens,
|
|
||||||
llama_token * output_tokens,
|
|
||||||
int32_t n_input,
|
|
||||||
int32_t max_length,
|
|
||||||
struct diffusion_params params,
|
|
||||||
int32_t & n_generated) {
|
|
||||||
|
|
||||||
n_generated = 0;
|
|
||||||
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const llama_model * model = llama_get_model(ctx);
|
|
||||||
|
|
||||||
// Initialize with input and pad with mask tokens
|
|
||||||
std::copy(input_tokens, input_tokens + n_input, output_tokens);
|
|
||||||
std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
std::vector<float> timesteps(params.steps + 1);
|
|
||||||
for (int32_t i = 0; i <= params.steps; i++) {
|
|
||||||
timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_set_causal_attn(ctx, false);
|
|
||||||
|
|
||||||
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates(n_vocab);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> conf_candidates;
|
|
||||||
conf_candidates.reserve(max_length);
|
|
||||||
|
|
||||||
std::vector<int32_t> mask_positions;
|
|
||||||
mask_positions.reserve(max_length);
|
|
||||||
|
|
||||||
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
|
||||||
if (params.top_k > 0) {
|
|
||||||
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
|
|
||||||
}
|
|
||||||
if (params.top_p < 1.0f) {
|
|
||||||
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
|
|
||||||
}
|
|
||||||
if (params.temperature > 0.0f) {
|
|
||||||
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
|
|
||||||
}
|
|
||||||
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
|
|
||||||
|
|
||||||
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
|
|
||||||
|
|
||||||
llama_batch batch = llama_batch_init(max_length, 0, 1);
|
|
||||||
batch.n_tokens = max_length;
|
|
||||||
|
|
||||||
int64_t total_sampling_time = 0;
|
|
||||||
int64_t total_time = 0;
|
|
||||||
|
|
||||||
int64_t time_start = ggml_time_us();
|
|
||||||
for (int32_t step = 0; step < params.steps; step++) {
|
|
||||||
if (params.step_callback) {
|
|
||||||
if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int32_t i = 0; i < max_length; i++) {
|
|
||||||
batch.token[i] = output_tokens[i];
|
|
||||||
batch.pos[i] = i;
|
|
||||||
batch.n_seq_id[i] = 1;
|
|
||||||
batch.seq_id[i][0] = 0;
|
|
||||||
batch.logits[i] = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ret = llama_decode(ctx, batch);
|
|
||||||
if (ret != 0) {
|
|
||||||
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
float * raw_logits = llama_get_logits(ctx);
|
|
||||||
if (!raw_logits) {
|
|
||||||
LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
|
|
||||||
return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
|
|
||||||
};
|
|
||||||
|
|
||||||
int64_t time_start_sampling = ggml_time_us();
|
|
||||||
|
|
||||||
mask_positions.clear();
|
|
||||||
for (int32_t i = 0; i < max_length; i++) {
|
|
||||||
if (output_tokens[i] == params.mask_token_id) {
|
|
||||||
mask_positions.push_back(i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mask_positions.empty()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
float t = timesteps[step];
|
|
||||||
float s = timesteps[step + 1];
|
|
||||||
|
|
||||||
if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
|
|
||||||
float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;
|
|
||||||
|
|
||||||
for (int32_t pos : mask_positions) {
|
|
||||||
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
|
|
||||||
const float * pos_logits = get_logits_for_pos(pos);
|
|
||||||
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates[token_id].id = token_id;
|
|
||||||
candidates[token_id].logit = pos_logits[token_id];
|
|
||||||
candidates[token_id].p = 0.0f;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array cur_p = {
|
|
||||||
/* .data = */ candidates.data(),
|
|
||||||
/* .size = */ (size_t) n_vocab, // Reset size to full vocab
|
|
||||||
/* .selected = */ -1,
|
|
||||||
/* .sorted = */ false,
|
|
||||||
};
|
|
||||||
|
|
||||||
llama_sampler_apply(sampler, &cur_p);
|
|
||||||
output_tokens[pos] = cur_p.data[cur_p.selected].id;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
std::vector<std::pair<float, int32_t>> confidences;
|
|
||||||
std::vector<llama_token> sampled_tokens(mask_positions.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < mask_positions.size(); i++) {
|
|
||||||
int32_t pos = mask_positions[i];
|
|
||||||
const float * pos_logits = get_logits_for_pos(pos);
|
|
||||||
|
|
||||||
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates[token_id].logit = pos_logits[token_id];
|
|
||||||
candidates[token_id].p = 0.0f;
|
|
||||||
candidates[token_id].id = token_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array cur_p = {
|
|
||||||
/* .data = */ candidates.data(),
|
|
||||||
/* .size = */ candidates.size(),
|
|
||||||
/* .selected = */ -1,
|
|
||||||
/* .sorted = */ false,
|
|
||||||
};
|
|
||||||
|
|
||||||
llama_sampler_apply(sampler, &cur_p);
|
|
||||||
|
|
||||||
llama_token sampled_token = cur_p.data[cur_p.selected].id;
|
|
||||||
|
|
||||||
float confidence = 0.0f;
|
|
||||||
if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
|
|
||||||
const float epsilon = 1e-10f;
|
|
||||||
for (size_t j = 0; j < cur_p.size; j++) {
|
|
||||||
float prob = cur_p.data[j].p;
|
|
||||||
confidence += prob * logf(prob + epsilon);
|
|
||||||
}
|
|
||||||
} else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
|
|
||||||
confidence = cur_p.data[0].p - cur_p.data[1].p;
|
|
||||||
} else {
|
|
||||||
confidence = cur_p.data[cur_p.selected].p;
|
|
||||||
}
|
|
||||||
|
|
||||||
sampled_tokens[i] = sampled_token;
|
|
||||||
confidences.emplace_back(confidence, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t num_transfer =
|
|
||||||
(step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();
|
|
||||||
|
|
||||||
if (num_transfer > 0) {
|
|
||||||
if (params.alg_temp == 0.0f) {
|
|
||||||
std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
|
|
||||||
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
|
|
||||||
if (a.first != b.first) {
|
|
||||||
return a.first > b.first;
|
|
||||||
}
|
|
||||||
return a.second < b.second;
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
conf_candidates.clear();
|
|
||||||
|
|
||||||
for (int32_t pos = 0; pos < max_length; pos++) {
|
|
||||||
float conf_logit = -std::numeric_limits<float>::infinity();
|
|
||||||
|
|
||||||
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
|
|
||||||
if (it != mask_positions.end()) {
|
|
||||||
size_t mask_idx = std::distance(mask_positions.begin(), it);
|
|
||||||
conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling
|
|
||||||
}
|
|
||||||
|
|
||||||
conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array conf_array = {
|
|
||||||
/* .data = */ conf_candidates.data(),
|
|
||||||
/* .size = */ conf_candidates.size(),
|
|
||||||
/* .selected = */ -1,
|
|
||||||
/* .sorted = */ false,
|
|
||||||
};
|
|
||||||
|
|
||||||
for (int32_t i = 0; i < num_transfer; i++) {
|
|
||||||
// Apply distribution sampler to get selected index
|
|
||||||
llama_sampler_apply(dist_sampler, &conf_array);
|
|
||||||
int selected_idx = conf_array.selected;
|
|
||||||
confidences[i].second = conf_candidates[selected_idx].id;
|
|
||||||
|
|
||||||
conf_candidates[selected_idx].p = 0.0f;
|
|
||||||
conf_array.selected = -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.alg_temp == 0.0f) {
|
|
||||||
// Deterministic - use confidence order
|
|
||||||
for (int32_t i = 0; i < num_transfer; i++) {
|
|
||||||
int32_t mask_idx = confidences[i].second;
|
|
||||||
int32_t pos = mask_positions[mask_idx];
|
|
||||||
llama_token token = sampled_tokens[mask_idx];
|
|
||||||
output_tokens[pos] = token;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int32_t i = 0; i < num_transfer; i++) {
|
|
||||||
int32_t pos = confidences[i].second;
|
|
||||||
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
|
|
||||||
if (it != mask_positions.end()) {
|
|
||||||
int32_t mask_idx = std::distance(mask_positions.begin(), it);
|
|
||||||
output_tokens[pos] = sampled_tokens[mask_idx];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int64_t time_end_sampling = ggml_time_us();
|
|
||||||
total_sampling_time += time_end_sampling - time_start_sampling;
|
|
||||||
}
|
|
||||||
int64_t time_end = ggml_time_us();
|
|
||||||
total_time += time_end - time_start;
|
|
||||||
|
|
||||||
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
|
|
||||||
total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);
|
|
||||||
|
|
||||||
|
|
||||||
llama_batch_free(batch);
|
|
||||||
llama_sampler_free(sampler);
|
|
||||||
llama_sampler_free(dist_sampler);
|
|
||||||
|
|
||||||
n_generated = max_length;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
|
|
||||||
if (!use_chat_template) {
|
|
||||||
return prompt;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto chat_templates = common_chat_templates_init(model, "");
|
|
||||||
|
|
||||||
common_chat_templates_inputs inputs;
|
|
||||||
common_chat_msg user_msg;
|
|
||||||
user_msg.role = "user";
|
|
||||||
user_msg.content = prompt;
|
|
||||||
inputs.add_generation_prompt = true;
|
|
||||||
inputs.messages.push_back(user_msg);
|
|
||||||
|
|
||||||
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
|
|
||||||
|
|
||||||
return result.prompt;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct callback_data {
|
struct callback_data {
|
||||||
const common_params_diffusion * diff_params;
|
diffusion_params * diff_params;
|
||||||
const llama_vocab * vocab;
|
const llama_vocab * vocab;
|
||||||
int32_t n_input;
|
int32_t n_input;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static float calculate_confidence(const llama_token_data_array & cur_p,
|
||||||
|
diffusion_algorithm algorithm,
|
||||||
|
std::mt19937 & rng) {
|
||||||
|
switch (algorithm) {
|
||||||
|
case CONFIDENCE_BASED:
|
||||||
|
return cur_p.data[cur_p.selected].p; // Selected token probability
|
||||||
|
|
||||||
|
case ENTROPY_BASED:
|
||||||
|
{
|
||||||
|
float entropy = 0.0f;
|
||||||
|
const float epsilon = 1e-10f;
|
||||||
|
for (size_t i = 0; i < cur_p.size; i++) {
|
||||||
|
float prob = cur_p.data[i].p;
|
||||||
|
entropy += prob * logf(prob + epsilon);
|
||||||
|
}
|
||||||
|
return -entropy; // Higher entropy = lower confidence
|
||||||
|
}
|
||||||
|
|
||||||
|
case MARGIN_BASED:
|
||||||
|
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
|
||||||
|
|
||||||
|
case RANDOM:
|
||||||
|
{
|
||||||
|
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
|
||||||
|
return uniform(rng); // Random confidence
|
||||||
|
}
|
||||||
|
|
||||||
|
case ORIGIN:
|
||||||
|
return cur_p.data[cur_p.selected].p;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
}
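As a quick sanity check on the measures above, a toy calculation (illustrative only, with an assumed top-3 distribution; not part of the patch):

```cpp
// Toy calculation (assumed distribution, not from the patch): what the three measures
// above return for a position whose top-3 probabilities are {0.7, 0.2, 0.1} and whose
// selected token is the most likely one.
#include <cmath>
#include <cstdio>

int main() {
    const float p[3] = { 0.7f, 0.2f, 0.1f };

    float conf_selected = p[0];         // CONFIDENCE_BASED: probability of the selected token
    float conf_margin   = p[0] - p[1];  // MARGIN_BASED: gap between the two most likely tokens

    float entropy = 0.0f;               // ENTROPY_BASED: -sum(p * log(p)), as in the function above
    for (float pi : p) {
        entropy += pi * logf(pi + 1e-10f);
    }
    entropy = -entropy;

    printf("confidence=%.3f margin=%.3f entropy=%.3f\n", conf_selected, conf_margin, entropy);
    return 0;
}
```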
|
||||||
|
|
||||||
|
// Unified transfer count calculation function
|
||||||
|
static int32_t calculate_transfer_count(int32_t step,
|
||||||
|
int32_t total_steps,
|
||||||
|
int32_t remaining_masked,
|
||||||
|
transfer_schedule schedule,
|
||||||
|
float eps,
|
||||||
|
const std::vector<int32_t> & num_transfer_tokens = {}) {
|
||||||
|
switch (schedule) {
|
||||||
|
case TIMESTEP_BASED:
|
||||||
|
{
|
||||||
|
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
|
||||||
|
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
|
||||||
|
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
|
||||||
|
return (int32_t) (remaining_masked * p_transfer);
|
||||||
|
}
|
||||||
|
|
||||||
|
case BLOCK_BASED:
|
||||||
|
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
|
||||||
|
return num_transfer_tokens[step];
|
||||||
|
}
|
||||||
|
return remaining_masked / (total_steps - step); // Fallback
|
||||||
|
|
||||||
|
default:
|
||||||
|
return remaining_masked / (total_steps - step);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static bool diffusion_step_callback(int32_t step,
|
static bool diffusion_step_callback(int32_t step,
|
||||||
int32_t total_steps,
|
int32_t total_steps,
|
||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
|
@ -391,6 +175,360 @@ static bool diffusion_step_callback(int32_t step,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
|
||||||
|
if (temperature == 0.0f) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uniform_real_distribution<double> uniform(0.0, 1.0);
|
||||||
|
for (int32_t i = 0; i < n_vocab; i++) {
|
||||||
|
double noise = uniform(rng);
|
||||||
|
// Prevent log(0)
|
||||||
|
noise = std::max(noise, 1e-20);
|
||||||
|
double gumbel_noise = std::pow(-std::log(noise), temperature);
|
||||||
|
logits[i] = std::exp(logits[i]) / gumbel_noise;
|
||||||
|
}
|
||||||
|
}
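One way to read the transform above (an interpretation, not something stated in the patch): in log space it adds temperature-scaled standard Gumbel noise to each logit, which is the classic Gumbel-max construction for sampling from `softmax(logits / temperature)` when the arg max is taken. A small numeric check of the identity, with assumed toy values:

```cpp
// Numeric check (assumed toy values, not from the patch): log(exp(l) / (-log u)^T)
// equals l + T * (-log(-log u)), i.e. the logit plus temperature-scaled Gumbel noise.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>

int main() {
    std::mt19937 rng(42);
    std::uniform_real_distribution<double> uniform(0.0, 1.0);

    const double l = 1.5;                          // assumed logit
    const double T = 0.7;                          // assumed temperature
    double u   = std::max(uniform(rng), 1e-20);    // avoid log(0), as add_gumbel_noise does
    double lhs = std::log(std::exp(l) / std::pow(-std::log(u), T));
    double rhs = l + T * (-std::log(-std::log(u)));
    printf("lhs = %f, rhs = %f\n", lhs, rhs);      // equal up to floating-point rounding
    return 0;
}
```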
|
||||||
|
|
||||||
|
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
|
||||||
|
std::vector<int32_t> num_transfer_tokens(steps);
|
||||||
|
|
||||||
|
int32_t base = mask_count / steps;
|
||||||
|
int32_t remainder = mask_count % steps;
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < steps; i++) {
|
||||||
|
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
return num_transfer_tokens;
|
||||||
|
}
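A short usage sketch for the helper above (assumed toy values): 10 masked tokens spread over 4 block steps come out as `{3, 3, 2, 2}`, with the remainder front-loaded.

```cpp
// Usage sketch (assumed values): a local copy of the helper above, splitting 10 masked
// tokens over 4 block steps as evenly as possible, remainder first.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
    std::vector<int32_t> num_transfer_tokens(steps);
    int32_t base      = mask_count / steps;
    int32_t remainder = mask_count % steps;
    for (int32_t i = 0; i < steps; i++) {
        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
    }
    return num_transfer_tokens;
}

int main() {
    for (int32_t n : get_num_transfer_tokens(/*mask_count=*/10, /*steps=*/4)) {
        printf("%d ", n);  // prints: 3 3 2 2
    }
    printf("\n");
    return 0;
}
```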
|
||||||
|
|
||||||
|
static void diffusion_generate(llama_context * ctx,
|
||||||
|
const llama_token * input_tokens,
|
||||||
|
llama_token * output_tokens,
|
||||||
|
int32_t n_input,
|
||||||
|
const diffusion_params & params,
|
||||||
|
int32_t & n_generated) {
|
||||||
|
n_generated = 0;
|
||||||
|
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_model * model = llama_get_model(ctx);
|
||||||
|
|
||||||
|
// Initialize with input and pad with mask tokens
|
||||||
|
std::copy(input_tokens, input_tokens + n_input, output_tokens);
|
||||||
|
std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
|
||||||
|
|
||||||
|
std::mt19937 rng(params.seed);
|
||||||
|
|
||||||
|
llama_set_causal_attn(ctx, false);
|
||||||
|
|
||||||
|
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
||||||
|
|
||||||
|
std::vector<llama_token_data> candidates(n_vocab);
|
||||||
|
std::vector<llama_token_data> conf_candidates;
|
||||||
|
conf_candidates.reserve(params.max_length);
|
||||||
|
std::vector<int32_t> mask_positions;
|
||||||
|
mask_positions.reserve(params.max_length);
|
||||||
|
|
||||||
|
// Setup sampler chain
|
||||||
|
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||||
|
if (params.top_k > 0) {
|
||||||
|
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
|
||||||
|
}
|
||||||
|
if (params.top_p < 1.0f) {
|
||||||
|
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
|
||||||
|
}
|
||||||
|
if (params.temperature > 0.0f) {
|
||||||
|
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
|
||||||
|
}
|
||||||
|
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
|
||||||
|
|
||||||
|
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_init(params.max_length, 0, 1);
|
||||||
|
batch.n_tokens = params.max_length;
|
||||||
|
|
||||||
|
// Pre-allocate buffers for CFG if needed
|
||||||
|
int32_t logits_size = n_vocab * params.max_length;
|
||||||
|
std::vector<float> cond_logits_buffer;
|
||||||
|
std::vector<llama_token> un_x_buffer;
|
||||||
|
if (params.cfg_scale > 0.0f) {
|
||||||
|
cond_logits_buffer.resize(logits_size);
|
||||||
|
un_x_buffer.resize(params.max_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// For block-based processing
|
||||||
|
std::vector<int32_t> num_transfer_tokens;
|
||||||
|
int32_t num_blocks = 1;
|
||||||
|
int32_t steps_per_block = params.steps;
|
||||||
|
|
||||||
|
if (params.schedule == BLOCK_BASED) {
|
||||||
|
GGML_ASSERT(params.max_length % params.block_length == 0);
|
||||||
|
num_blocks = params.max_length / params.block_length;
|
||||||
|
GGML_ASSERT(params.steps % num_blocks == 0);
|
||||||
|
steps_per_block = params.steps / num_blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> confidence(params.max_length);
|
||||||
|
|
||||||
|
int64_t total_sampling_time = 0;
|
||||||
|
int64_t total_time = 0;
|
||||||
|
int64_t time_start = ggml_time_us();
|
||||||
|
|
||||||
|
for (int block_num = 0; block_num < num_blocks; block_num++) {
|
||||||
|
int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
|
||||||
|
int32_t block_end = (params.schedule == BLOCK_BASED) ?
|
||||||
|
std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
|
||||||
|
params.max_length;
|
||||||
|
|
||||||
|
// Count masked tokens in current block for block-based processing
|
||||||
|
if (params.schedule == BLOCK_BASED) {
|
||||||
|
int32_t block_mask_count = 0;
|
||||||
|
for (int i = block_start; i < block_end; i++) {
|
||||||
|
if (output_tokens[i] == params.mask_token_id) {
|
||||||
|
block_mask_count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int32_t step = 0; step < steps_per_block; step++) {
|
||||||
|
int32_t global_step = block_num * steps_per_block + step;
|
||||||
|
|
||||||
|
if (params.step_callback) {
|
||||||
|
if (!params.step_callback(
|
||||||
|
global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup batch
|
||||||
|
for (int32_t i = 0; i < params.max_length; i++) {
|
||||||
|
batch.token[i] = output_tokens[i];
|
||||||
|
batch.pos[i] = i;
|
||||||
|
batch.n_seq_id[i] = 1;
|
||||||
|
batch.seq_id[i][0] = 0;
|
||||||
|
batch.logits[i] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
float * logits = nullptr;
|
||||||
|
|
||||||
|
if (params.cfg_scale > 0.0f) {
|
||||||
|
int ret = llama_decode(ctx, batch);
|
||||||
|
if (ret != 0) {
|
||||||
|
LOG_ERR("Failed to generate conditional");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
float * cond_logits_ptr = llama_get_logits(ctx);
|
||||||
|
std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
|
||||||
|
|
||||||
|
// Unconditional generation (mask input)
|
||||||
|
std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
|
||||||
|
for (int32_t i = 0; i < n_input; i++) {
|
||||||
|
un_x_buffer[i] = params.mask_token_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < params.max_length; i++) {
|
||||||
|
batch.token[i] = un_x_buffer[i];
|
||||||
|
}
|
||||||
|
ret = llama_decode(ctx, batch);
|
||||||
|
if (ret != 0) {
|
||||||
|
LOG_ERR("Failed to generate unconditional");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
float * uncond_logits = llama_get_logits(ctx);
|
||||||
|
|
||||||
|
// Apply CFG
|
||||||
|
for (int32_t i = 0; i < logits_size; i++) {
|
||||||
|
cond_logits_buffer[i] =
|
||||||
|
uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
|
||||||
|
}
|
||||||
|
logits = cond_logits_buffer.data();
|
||||||
|
} else {
|
||||||
|
int ret = llama_decode(ctx, batch);
|
||||||
|
if (ret != 0) {
|
||||||
|
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
logits = llama_get_logits(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!logits) {
|
||||||
|
LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
|
||||||
|
if (params.shift_logits) {
|
||||||
|
return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
|
||||||
|
}
|
||||||
|
return logits + pos * n_vocab;
|
||||||
|
};
|
||||||
|
|
||||||
|
int64_t time_start_sampling = ggml_time_us();
|
||||||
|
|
||||||
|
mask_positions.clear();
|
||||||
|
for (int32_t i = 0; i < params.max_length; i++) {
|
||||||
|
if (output_tokens[i] == params.mask_token_id) {
|
||||||
|
// For block-based, only consider current block
|
||||||
|
if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
|
||||||
|
mask_positions.push_back(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mask_positions.empty()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.add_gumbel_noise && params.temperature > 0.0f) {
|
||||||
|
add_gumbel_noise(logits, n_vocab, params.temperature, rng);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.algorithm == ORIGIN) {
|
||||||
|
int32_t transfer_count = calculate_transfer_count(
|
||||||
|
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
|
||||||
|
float p_transfer = (float) transfer_count / mask_positions.size();
|
||||||
|
|
||||||
|
for (int32_t pos : mask_positions) {
|
||||||
|
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
|
||||||
|
const float * pos_logits = get_logits_for_pos(pos);
|
||||||
|
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates[token_id].id = token_id;
|
||||||
|
candidates[token_id].logit = pos_logits[token_id];
|
||||||
|
candidates[token_id].p = 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array cur_p = {
|
||||||
|
candidates.data(),
|
||||||
|
(size_t) n_vocab,
|
||||||
|
-1,
|
||||||
|
false,
|
||||||
|
};
|
||||||
|
|
||||||
|
llama_sampler_apply(sampler, &cur_p);
|
||||||
|
output_tokens[pos] = cur_p.data[cur_p.selected].id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
std::vector<std::pair<float, int32_t>> confidences;
|
||||||
|
std::vector<llama_token> sampled_tokens(mask_positions.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < mask_positions.size(); i++) {
|
||||||
|
int32_t pos = mask_positions[i];
|
||||||
|
const float * pos_logits = get_logits_for_pos(pos);
|
||||||
|
|
||||||
|
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates[token_id].logit = pos_logits[token_id];
|
||||||
|
candidates[token_id].p = 0.0f;
|
||||||
|
candidates[token_id].id = token_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array cur_p = {
|
||||||
|
candidates.data(),
|
||||||
|
candidates.size(),
|
||||||
|
-1,
|
||||||
|
false,
|
||||||
|
};
|
||||||
|
|
||||||
|
llama_sampler_apply(sampler, &cur_p);
|
||||||
|
llama_token sampled_token = cur_p.data[cur_p.selected].id;
|
||||||
|
|
||||||
|
float conf = calculate_confidence(cur_p, params.algorithm, rng);
|
||||||
|
|
||||||
|
sampled_tokens[i] = sampled_token;
|
||||||
|
confidences.emplace_back(conf, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t transfer_count = calculate_transfer_count(
|
||||||
|
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
|
||||||
|
|
||||||
|
if (transfer_count > 0) {
|
||||||
|
if (params.alg_temp == 0.0f) {
|
||||||
|
std::partial_sort(confidences.begin(),
|
||||||
|
confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
|
||||||
|
confidences.end(),
|
||||||
|
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
|
||||||
|
if (a.first != b.first) {
|
||||||
|
return a.first > b.first;
|
||||||
|
}
|
||||||
|
return a.second < b.second;
|
||||||
|
});
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
|
||||||
|
int32_t mask_idx = confidences[i].second;
|
||||||
|
int32_t pos = mask_positions[mask_idx];
|
||||||
|
output_tokens[pos] = sampled_tokens[mask_idx];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
conf_candidates.clear();
|
||||||
|
for (size_t i = 0; i < confidences.size(); i++) {
|
||||||
|
float conf_logit = confidences[i].first / params.alg_temp;
|
||||||
|
conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array conf_array = {
|
||||||
|
conf_candidates.data(),
|
||||||
|
conf_candidates.size(),
|
||||||
|
-1,
|
||||||
|
false,
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
|
||||||
|
llama_sampler_apply(dist_sampler, &conf_array);
|
||||||
|
int32_t selected_idx = conf_array.selected;
|
||||||
|
int32_t mask_idx = selected_idx;
|
||||||
|
int32_t pos = mask_positions[mask_idx];
|
||||||
|
output_tokens[pos] = sampled_tokens[mask_idx];
|
||||||
|
|
||||||
|
conf_candidates[selected_idx].p = 0.0f;
|
||||||
|
conf_array.selected = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t time_end_sampling = ggml_time_us();
|
||||||
|
total_sampling_time += time_end_sampling - time_start_sampling;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t time_end = ggml_time_us();
|
||||||
|
total_time += time_end - time_start;
|
||||||
|
|
||||||
|
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
|
||||||
|
total_time / 1000.0,
|
||||||
|
total_time / 1000.0 / params.steps,
|
||||||
|
total_sampling_time / 1000.0 / params.steps);
|
||||||
|
|
||||||
|
llama_batch_free(batch);
|
||||||
|
llama_sampler_free(sampler);
|
||||||
|
llama_sampler_free(dist_sampler);
|
||||||
|
|
||||||
|
n_generated = params.max_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
|
||||||
|
if (!use_chat_template) {
|
||||||
|
return prompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto chat_templates = common_chat_templates_init(model, "");
|
||||||
|
|
||||||
|
common_chat_templates_inputs inputs;
|
||||||
|
common_chat_msg user_msg;
|
||||||
|
user_msg.role = "user";
|
||||||
|
user_msg.content = prompt;
|
||||||
|
inputs.add_generation_prompt = true;
|
||||||
|
inputs.messages.push_back(user_msg);
|
||||||
|
|
||||||
|
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
|
||||||
|
|
||||||
|
return result.prompt;
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
|
@ -400,11 +538,6 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
|
|
||||||
const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
|
|
||||||
alg_names[params.diffusion.algorithm] :
|
|
||||||
"UNKNOWN";
|
|
||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
|
@ -421,6 +554,12 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!llama_model_is_diffusion(model)) {
|
||||||
|
LOG_ERR("error: unsupported model for diffusion");
|
||||||
|
llama_model_free(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
ctx_params.n_ctx = params.n_ctx;
|
ctx_params.n_ctx = params.n_ctx;
|
||||||
ctx_params.n_batch = params.n_batch;
|
ctx_params.n_batch = params.n_batch;
|
||||||
|
@ -442,9 +581,11 @@ int main(int argc, char ** argv) {
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||||
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
|
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
|
||||||
|
|
||||||
std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
|
std::vector<llama_token> input_tokens = common_tokenize(vocab,
|
||||||
|
formatted_prompt,
|
||||||
/*add special tokens*/ true,
|
/*add special tokens*/ true,
|
||||||
/*parse special*/ true);
|
/*parse special*/ true);
|
||||||
|
|
||||||
int n_input = input_tokens.size();
|
int n_input = input_tokens.size();
|
||||||
|
|
||||||
if (n_input >= params.n_ctx) {
|
if (n_input >= params.n_ctx) {
|
||||||
|
@ -454,44 +595,79 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct diffusion_params ldiff_params = diffusion_default_params();
|
|
||||||
ldiff_params.steps = params.diffusion.steps;
|
|
||||||
ldiff_params.eps = params.diffusion.eps;
|
|
||||||
ldiff_params.temperature = params.sampling.temp;
|
|
||||||
ldiff_params.top_p = params.sampling.top_p;
|
|
||||||
ldiff_params.top_k = params.sampling.top_k;
|
|
||||||
ldiff_params.algorithm = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
|
|
||||||
ldiff_params.alg_temp = params.diffusion.alg_temp;
|
|
||||||
ldiff_params.seed = params.sampling.seed;
|
|
||||||
|
|
||||||
llama_token mask_token_id = llama_vocab_mask(vocab);
|
llama_token mask_token_id = llama_vocab_mask(vocab);
|
||||||
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
|
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
|
||||||
|
|
||||||
LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
|
bool visual_mode = params.diffusion.visual_mode;
|
||||||
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps);
|
|
||||||
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps);
|
|
||||||
LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm,
|
|
||||||
alg_name);
|
|
||||||
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp);
|
|
||||||
|
|
||||||
ldiff_params.mask_token_id = mask_token_id;
|
|
||||||
|
|
||||||
callback_data cb_data = { ¶ms.diffusion, vocab, n_input };
|
|
||||||
|
|
||||||
ldiff_params.step_callback = diffusion_step_callback;
|
|
||||||
ldiff_params.step_callback_user_data = &cb_data;
|
|
||||||
|
|
||||||
int32_t n_generated = 0;
|
int32_t n_generated = 0;
|
||||||
|
|
||||||
std::vector<llama_token> output_tokens(params.n_ubatch);
|
std::vector<llama_token> output_tokens(params.n_ubatch);
|
||||||
diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
|
|
||||||
ldiff_params, n_generated);
|
struct diffusion_params diff_params;
|
||||||
|
|
||||||
|
char shift_logits_str[8];
|
||||||
|
if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) {
|
||||||
|
diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0);
|
||||||
|
} else {
|
||||||
|
diff_params.shift_logits = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Use either eps or block length, but not both
|
||||||
|
GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
|
||||||
|
|
||||||
|
if (params.diffusion.eps) {
|
||||||
|
diff_params.schedule = TIMESTEP_BASED;
|
||||||
|
diff_params.eps = params.diffusion.eps;
|
||||||
|
} else if (params.diffusion.block_length) {
|
||||||
|
diff_params.schedule = BLOCK_BASED;
|
||||||
|
diff_params.block_length = params.diffusion.block_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
diff_params.mask_token_id = mask_token_id;
|
||||||
|
diff_params.seed = params.sampling.seed;
|
||||||
|
diff_params.temperature = params.sampling.temp;
|
||||||
|
diff_params.steps = params.diffusion.steps;
|
||||||
|
diff_params.algorithm = static_cast<diffusion_algorithm>(params.diffusion.algorithm);
|
||||||
|
diff_params.max_length = params.n_ubatch;
|
||||||
|
diff_params.top_p = params.sampling.top_p;
|
||||||
|
diff_params.top_k = params.sampling.top_k;
|
||||||
|
diff_params.visual_mode = params.diffusion.visual_mode;
|
||||||
|
diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise;
|
||||||
|
|
||||||
|
diff_params.step_callback = diffusion_step_callback;
|
||||||
|
callback_data cb_data = { &diff_params, vocab, n_input };
|
||||||
|
diff_params.step_callback_user_data = &cb_data;
|
||||||
|
|
||||||
|
const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
|
||||||
|
const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
|
||||||
|
const char * alg_name =
|
||||||
|
(diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
|
||||||
|
const char * sched_name =
|
||||||
|
(diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN";
|
||||||
|
|
||||||
|
LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
|
||||||
|
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps);
|
||||||
|
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length);
|
||||||
|
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
|
||||||
|
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
|
||||||
|
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
|
||||||
|
if (diff_params.schedule == TIMESTEP_BASED) {
|
||||||
|
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
|
||||||
|
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
|
||||||
|
}
|
||||||
|
if (diff_params.schedule == BLOCK_BASED) {
|
||||||
|
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
|
||||||
|
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated);
|
||||||
|
|
||||||
if (n_generated > 0) {
|
if (n_generated > 0) {
|
||||||
if (params.diffusion.visual_mode) {
|
if (visual_mode) {
|
||||||
//clear screen and move cursor to top-left
|
//clear screen and move cursor to top-left
|
||||||
LOG_INF("\033[2J\033[H");
|
LOG_INF("\033[2J\033[H");
|
||||||
}
|
}
|
||||||
|
|
||||||
output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
|
output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
|
||||||
std::string output_data = common_detokenize(vocab, output_tokens, false);
|
std::string output_data = common_detokenize(vocab, output_tokens, false);
|
||||||
LOG_INF("\n%s\n", output_data.c_str());
|
LOG_INF("\n%s\n", output_data.c_str());
|
||||||
|
|
|
@ -37,17 +37,21 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
@ -72,11 +76,13 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__loongarch64)
|
#elif defined(__loongarch64)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
@ -92,11 +98,13 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__riscv)
|
#elif defined(__riscv)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
@ -119,10 +127,12 @@
|
||||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__s390x__)
|
#elif defined(__s390x__)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
@ -147,11 +157,13 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#elif defined(__wasm__)
|
#elif defined(__wasm__)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
@ -175,10 +187,12 @@
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
#endif
|
#endif
|
||||||
|
|
File diff suppressed because it is too large
@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
|
const int qk = QK_K;
|
||||||
|
const int nb = n / qk;
|
||||||
|
const int ncols_interleaved = 8;
|
||||||
|
const int blocklen = 8;
|
||||||
|
|
||||||
|
assert (n % qk == 0);
|
||||||
|
assert (nc % ncols_interleaved == 0);
|
||||||
|
|
||||||
|
UNUSED(s);
|
||||||
|
UNUSED(bs);
|
||||||
|
UNUSED(vx);
|
||||||
|
UNUSED(vy);
|
||||||
|
UNUSED(nr);
|
||||||
|
UNUSED(nc);
|
||||||
|
UNUSED(nb);
|
||||||
|
UNUSED(ncols_interleaved);
|
||||||
|
UNUSED(blocklen);
|
||||||
|
|
||||||
|
float sumf[8];
|
||||||
|
float sum_minf[8];
|
||||||
|
int sumi1,sumi2,sumi3,sumi4;
|
||||||
|
int sumi;
|
||||||
|
|
||||||
|
const block_q8_K * a_ptr = (const block_q8_K *)vy;
|
||||||
|
for(int x = 0; x < nc / ncols_interleaved; x++) {
|
||||||
|
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
sumf[j] = 0.0;
|
||||||
|
sum_minf[j] = 0.0;
|
||||||
|
}
|
||||||
|
for (int l = 0; l < nb; l++) {
|
||||||
|
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
||||||
|
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
||||||
|
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
||||||
|
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
||||||
|
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
sumi1 = 0;
|
||||||
|
sumi2 = 0;
|
||||||
|
sumi3 = 0;
|
||||||
|
sumi4 = 0;
|
||||||
|
sumi = 0;
|
||||||
|
int offset = ((k / 2) % 2) + j * 2;
|
||||||
|
for (int i = 0; i < blocklen; ++i){
|
||||||
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
||||||
|
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
||||||
|
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
||||||
|
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
||||||
|
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
||||||
|
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
||||||
|
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
||||||
|
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
||||||
|
|
||||||
|
sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
||||||
|
sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
||||||
|
sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
||||||
|
sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
||||||
|
sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
||||||
|
}
|
||||||
|
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int sb = 0; sb < 8; sb++) {
|
||||||
|
const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
||||||
|
for(int j = 0; j < ncols_interleaved; j++){
|
||||||
|
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int j = 0; j < ncols_interleaved; j++) {
|
||||||
|
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {

                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64;
                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i){
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6) & 3);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for (int sb = 0; sb < 8; sb++) {
                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
                    for (int m = 0; m < 4; m++) {
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for (int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2) + 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}
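The `bsums` pointer arithmetic above is the only non-obvious indexing in this routine: for sub-block `sb` and row `m` it selects a consecutive pair of row sums out of the interleaved `block_q8_Kx4.bsums` layout. A throwaway sketch that just prints the offsets produced by that formula (illustration only, nothing assumed beyond the expression in the code):

#include <cstdio>

int main() {
    // Offset of the first of the two bsums entries used for sub-block sb and
    // row m, exactly as computed in ggml_gemm_q2_K_8x8_q8_K_generic.
    for (int sb = 0; sb < 8; sb++) {
        for (int m = 0; m < 4; m++) {
            int off = (sb * 8) + (m * 4) - ((sb % 2) * 6);
            printf("sb=%d m=%d -> bsums[%d], bsums[%d]\n", sb, m, off, off + 1);
        }
    }
    return 0;
}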
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@ -916,6 +1083,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
    return out;
}

static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // Delta (scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave Q2_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The logic below unpacks and rearranges the scales and mins values of Q2_K.
    // The Q2_K structure has 16 scales and 16 mins packed into 16 bytes (4 bits for each value).
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins.
    // Every 16 bytes is packed such that it contains the scales and mins for the corresponding sub blocks from the Q2_K structures.
    // For example, the first 16 bytes contain 16 scales and 16 mins - one each for the first and second sub blocks of the different Q2_K structures.
    for (int i = 0; i < 128; i++) {
        // Index for selecting which q2k super block
        int src1 = (i % 16) / 2;
        // Index for selecting scale
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }
    return out;
}
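The scale/min rearrangement above is easier to see by printing the source indices it reads from: for each output byte `i`, `src1` picks which of the eight Q2_K super-blocks to read and `src2` picks which packed scale byte inside it. A throwaway sketch of that mapping (illustration only, it simply replays the index arithmetic from `make_block_q2_Kx8`):

#include <cstdio>

int main() {
    // Reproduce the index arithmetic for the first 32 output bytes:
    // out.scales[i] = in[src1].scales[src2].
    for (int i = 0; i < 32; i++) {
        int src1 = (i % 16) / 2;              // which Q2_K super-block
        int src2 = ((i / 16) * 2) + (i % 2);  // which packed scale byte inside it
        printf("out.scales[%2d] <- in[%d].scales[%d]\n", i, src1, src2);
    }
    return 0;
}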
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    if(kcpp_q_already_repacked) //using legacy prepacked quant, so just copy it
    {
@ -982,6 +1193,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
    GGML_UNUSED(data_size);
}

static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
    GGML_ASSERT(interleave_block == 8);
    constexpr int nrows_interleaved = 8;

    block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
    const block_q2_K * src = (const block_q2_K*) data;
    block_q2_K dst_tmp[8];
    int nrow = ggml_nrows(t);
    int nblocks = t->ne[0] / QK_K;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));

    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }

    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++ ) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    GGML_UNUSED(data_size);
}
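The gather pattern of this repack mirrors the existing Q4_K path: for every group of eight rows it walks the block columns and pulls one `block_q2_K` from each row before interleaving them. A small sketch of just the index math, using made-up tensor sizes for illustration:

#include <cstdio>

int main() {
    const int nrows_interleaved = 8;
    const int nrow    = 16; // hypothetical: two groups of eight rows
    const int nblocks = 2;  // hypothetical: two QK_K blocks per row

    // src is laid out row-major as nrow * nblocks blocks; print which source
    // blocks feed each interleaved output block, as in repack_q2_K_to_q2_K_8_bl.
    int base = 0;
    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int x = 0; x < nblocks; x++) {
            printf("out block (rows %d..%d, col %d) <- src[", b, b + nrows_interleaved - 1, x);
            for (int i = 0; i < nrows_interleaved; i++) {
                printf("%d%s", base + x + i * nblocks, i + 1 < nrows_interleaved ? ", " : "");
            }
            printf("]\n");
        }
        base += nrows_interleaved * nblocks;
    }
    return 0;
}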
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    if(kcpp_q_already_repacked) //using legacy prepacked quant, so just copy it
    {
@ -1112,6 +1354,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}

@ -1141,6 +1387,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

@ -1165,6 +1415,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

@ -1447,12 +1701,14 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
        //we shall just use the regular avx2 handling, no repacking
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
        if (/*ggml_cpu_has_avx2() ||*/ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }

@ -1468,11 +1724,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
            }
        }
    } else if (cur->type == GGML_TYPE_Q4_K) {
        // if (ggml_cpu_has_avx2()) {
        if (ggml_cpu_has_avx2()) {
        // if (cur->ne[1] % 8 == 0) {
            if (cur->ne[1] % 8 == 0) {
        // return &q4_K_8x8_q8_K;
                return &q4_K_8x8_q8_K;
        // }
            }
        // }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
@ -44,7 +44,14 @@ struct block_q4_Kx8 {
};

static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");

struct block_q2_Kx8 {
    ggml_half d[8];      // super-block scale for quantized scales
    ggml_half dmin[8];   // super-block scale for quantized mins
    uint8_t scales[128]; // scales and mins, quantized with 4 bits
    uint8_t qs[512];     // 2-bit quants
};

static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");

struct block_q8_Kx4 {
    float d[4];          // delta
    int8_t qs[QK_K * 4]; // quants
@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// Native implementations
@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#if defined(__cplusplus)
@ -231,9 +231,9 @@ typedef float2 dfloat2;
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define NEW_MMA_AVAILABLE

@ -297,10 +297,9 @@ static bool fp32_mma_hardware_available(const int cc) {
    return GGML_CUDA_CC_IS_CDNA(cc);
}

// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
static bool amd_mfma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_MFMA)
    return GGML_CUDA_CC_IS_CDNA3(cc);
    return GGML_CUDA_CC_IS_CDNA(cc);
#else
    return false;
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)

@ -436,6 +435,20 @@ static __global__ void reduce_rows_f32(const float * x, float * dst, const int n
    dst[row] = norm ? sum / ncols : sum;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
#ifdef GGML_USE_HIP
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = x && __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#else
    static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
    return __all_sync(0xffffffff, x);
#endif // GGML_USE_HIP
}

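`warp_reduce_all` returns whether the predicate is non-zero on every lane of the warp: on CUDA it maps straight to `__all_sync`, on HIP it is emulated with XOR shuffles. A host-side reference of what the reduction computes, useful only as a mental model (plain C++, illustration only, not CUDA code):

#include <cstdio>

// Logical AND of a predicate across all lanes of a warp, computed serially.
static int warp_all_reference(const int * lanes, int warp_size) {
    int all = 1;
    for (int i = 0; i < warp_size; ++i) {
        all = all && lanes[i];
    }
    return all;
}

int main() {
    int lanes[32];
    for (int i = 0; i < 32; ++i) lanes[i] = 1;
    printf("all set      -> %d\n", warp_all_reference(lanes, 32)); // 1
    lanes[17] = 0;
    printf("one lane off -> %d\n", warp_all_reference(lanes, 32)); // 0
    return 0;
}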
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
@ -15,6 +15,7 @@ typedef void (* fattn_kernel_t)(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -500,6 +501,55 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
        nullptr;
}

template <int ncols1>
__launch_bounds__(FATTN_KQ_STRIDE/2, 1)
static __global__ void flash_attn_mask_to_KV_max(
        const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) {
    const int ne31     = gridDim.x;
    const int tid      = threadIdx.x;
    const int sequence = blockIdx.y;
    const int jt       = blockIdx.x;

    mask += sequence*s33 + jt*ncols1*s31;

    __shared__ int buf_iw[WARP_SIZE];
    if (tid < WARP_SIZE) {
        buf_iw[tid] = 1;
    }
    __syncthreads();

    int KV_max_sj = (ne30 - 1) * FATTN_KQ_STRIDE;
    for (; KV_max_sj >= 0; KV_max_sj -= FATTN_KQ_STRIDE) {
        int all_inf = 1;

#pragma unroll
        for (int j = 0; j < ncols1; ++j) {
            const float2 tmp = __half22float2(mask[j*s31 + KV_max_sj/2 + tid]);
            all_inf = all_inf && int(isinf(tmp.x)) && int(isinf(tmp.y));
        }

        all_inf = warp_reduce_all(all_inf);
        if (tid % WARP_SIZE == 0) {
            buf_iw[tid / WARP_SIZE] = all_inf;
        }
        __syncthreads();
        all_inf = buf_iw[tid % WARP_SIZE];
        __syncthreads();
        all_inf = warp_reduce_all(all_inf);

        if (!all_inf) {
            KV_max_sj += FATTN_KQ_STRIDE;
            break;
        }
    }

    if (threadIdx.x != 0) {
        return;
    }

    KV_max[sequence*ne31 + jt] = KV_max_sj;
}

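The kernel scans the mask from the last FATTN_KQ_STRIDE-wide slice backwards and records the first slice (from the back) that is not entirely -inf, so the attention kernels can stop at `KV_max` instead of iterating to `ne11`. A scalar reference of that scan, under the simplifying assumption of a single row of float mask values (plain C++, illustration only):

#include <cmath>
#include <cstdio>
#include <vector>

// Given one row of mask values, return how many KV positions need processing,
// rounded up to the stride; everything past the result is fully masked (-inf).
static int kv_max_reference(const std::vector<float> & mask, int stride) {
    int kv_max = (int) mask.size();
    while (kv_max > 0) {
        bool all_inf = true;
        for (int i = kv_max - stride; i < kv_max; ++i) {
            all_inf = all_inf && std::isinf(mask[i]);
        }
        if (!all_inf) {
            break;
        }
        kv_max -= stride;
    }
    return kv_max;
}

int main() {
    const int stride = 4; // stand-in for FATTN_KQ_STRIDE
    std::vector<float> mask(16, -INFINITY);
    for (int i = 0; i < 6; ++i) mask[i] = 0.0f; // only the first 6 positions are visible
    printf("KV_max = %d\n", kv_max_reference(mask, stride)); // prints 8: two stride-4 slices
    return 0;
}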
template<int D, int ncols1, int ncols2> // D == head size
__launch_bounds__(D, 1)
static __global__ void flash_attn_stream_k_fixup(
@ -711,6 +761,7 @@ void launch_fattn(

    ggml_cuda_pool_alloc<half> K_f16(pool);
    ggml_cuda_pool_alloc<half> V_f16(pool);
    ggml_cuda_pool_alloc<int> KV_max(pool);
    ggml_cuda_pool_alloc<float> dst_tmp(pool);
    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);

@ -779,11 +830,30 @@ void launch_fattn(
        V_data = (char *) V_f16.ptr;
    }

    int parallel_blocks = 1;

    const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
    const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];

    // Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
    // Only worth the overhead if there is at least one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
    // multiple sequences of possibly different lengths.
    if (mask && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
        const int s31 = mask->nb[1] / sizeof(half2);
        const int s33 = mask->nb[3] / sizeof(half2);

        const dim3 blocks_num_KV_max(ntiles_x, Q->ne[3], 1);
        const dim3 block_dim_KV_max(FATTN_KQ_STRIDE/2, 1, 1);

        const int ne_KV_max = blocks_num_KV_max.x*blocks_num_KV_max.y;
        const int iter_k = K->ne[1] / FATTN_KQ_STRIDE;

        KV_max.alloc(ne_KV_max);
        flash_attn_mask_to_KV_max<ncols1><<<blocks_num_KV_max, block_dim_KV_max, 0, main_stream>>>
            ((const half2 *) mask->data, KV_max.ptr, iter_k, s31, s33);
        CUDA_CHECK(cudaGetLastError());
    }

    int parallel_blocks = 1;

    const dim3 block_dim(warp_size, nwarps, 1);
    int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));

@ -870,6 +940,7 @@ void launch_fattn(
        K_data,
        V_data,
        mask ? ((const char *) mask->data) : nullptr,
        KV_max.ptr,
        !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],

@ -392,7 +392,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
    }
}

template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles, bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup, bool last_iter>
template<int DKQ, int DV, int ncols1, int ncols2, int nwarps, int ntiles,
         bool use_logit_softcap, bool mla, bool needs_fixup, bool is_fixup, bool last_iter>
static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        const float2 * const __restrict__ Q_f2,
        const half2  * const __restrict__ K_h2,

@ -922,7 +923,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    }

    // Iterate over ne11 == previous tokens:
    for (int kb0 = kb0_start; kb0 < kb0_stop-1; ++kb0) {
    int kb0 = kb0_start;
    for (; kb0 < kb0_stop-1; ++kb0) {
        constexpr bool last_iter = false;
        flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
            (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,

@ -932,7 +934,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        constexpr bool last_iter = true;
        flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
            (Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
             ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
             ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
    }

    // With multi-stage loading there is no __syncthreads at the end of the iter,

@ -1204,6 +1206,7 @@ static __global__ void flash_attn_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -1280,7 +1283,11 @@ static __global__ void flash_attn_ext_f16(
        const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;

        const int kb0_start_kernel = kb0_start * kb_niter;
        const int kb0_stop_kernel  = kb0_stop  * kb_niter;
        int       kb0_stop_kernel  = kb0_stop  * kb_niter;

        if (KV_max) {
            kb0_stop_kernel = min(kb0_stop_kernel, KV_max[sequence*iter_j + jt] / c::nbatch_fa);
        }

        constexpr bool is_fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
        if (kb0_start == 0) {

@ -1321,7 +1328,11 @@ static __global__ void flash_attn_ext_f16(
    const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;

    const int kb0_start_kernel = kb0_start * kb_niter;
    const int kb0_stop_kernel  = kb0_stop  * kb_niter;
    int       kb0_stop_kernel  = kb0_stop  * kb_niter;

    if (KV_max) {
        kb0_stop_kernel = min(kb0_stop_kernel, KV_max[sequence*iter_j + jt] / c::nbatch_fa);
    }

    constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
    constexpr bool needs_fixup = false;

@ -13,6 +13,7 @@ static __global__ void flash_attn_tile_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -90,7 +91,8 @@ static __global__ void flash_attn_tile_ext_f16(

    __syncthreads();

    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        half kqmax_new[ncols/nwarps];

@ -13,6 +13,7 @@ static __global__ void flash_attn_tile_ext_f32(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -99,7 +100,8 @@ static __global__ void flash_attn_tile_ext_f32(

    __syncthreads();

    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new[ncols/nwarps];

@ -16,6 +16,7 @@ static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -177,10 +178,11 @@ static __global__ void flash_attn_vec_ext_f16(

    half2 VKQ[ncols] = {{0.0f, 0.0f}};

    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
    K     += blockIdx.y*D * nb11;
    V     += blockIdx.y*D * nb21;
    maskh += blockIdx.y*D;
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
         // Increment pointers after each loop:
         K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {

@ -191,29 +193,7 @@ static __global__ void flash_attn_vec_ext_f16(
            for (int j = 0; j < ncols; ++j) {
                maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
            }

            __syncthreads();

            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
            // In such cases, skip the KV slice.
            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
#ifndef GGML_USE_HIP
            bool skip = true;
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
#pragma unroll
                for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;

                    const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
                    skip = skip && isinf(tmp.x) && isinf(tmp.y);
                }
            }
            if (__all_sync(0xFFFFFFFF, skip)) {
                __syncthreads();
                continue;
            }
#endif // GGML_USE_HIP
        }

        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,

@ -16,6 +16,7 @@ static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -183,10 +184,11 @@ static __global__ void flash_attn_vec_ext_f32(

    float VKQ[ncols] = {0.0f};

    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
    K     += blockIdx.y*D * nb11;
    V     += blockIdx.y*D * nb21;
    maskh += blockIdx.y*D;
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D,
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
         // Increment pointers after each loop:
         K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {

@ -197,28 +199,7 @@ static __global__ void flash_attn_vec_ext_f32(
            for (int j = 0; j < ncols; ++j) {
                maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
            }

            __syncthreads();

            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
            // In such cases, skip the KV slice.
            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
#ifndef GGML_USE_HIP
            bool skip = true;
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
#pragma unroll
                for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;

                    skip = skip && isinf(maskf_shared[j*D + i]);
                }
            }
            if (__all_sync(0xFFFFFFFF, skip)) {
                __syncthreads();
                continue;
            }
#endif // GGML_USE_HIP
        }

        float kqmax_new_arr[ncols];

@ -29,6 +29,7 @@ static __global__ void flash_attn_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        const int * __restrict__ KV_max,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,

@ -165,7 +166,8 @@ static __global__ void flash_attn_ext_f16(
    __syncthreads();

    // Iterate over ne11 == previous tokens:
    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE) {
    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE) {
        // Calculate tile of KQ:
#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {

@ -315,7 +315,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst

    const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
    const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies &&
        (Q->ne[3] > 1 || cc < GGML_CUDA_CC_ADA_LOVELACE) && !mma_needs_data_conversion;
    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
    if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
        if (prec == GGML_PREC_DEFAULT) {

@ -109,8 +109,8 @@ void ggml_cuda_mul_mat_q(
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3  = dst->nb[3]  / ts_dst;

    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
        || GGML_CUDA_CC_IS_CDNA(cc);

    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +

@ -252,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
        || GGML_CUDA_CC_IS_CDNA(cc))
        && src1_ncols == ne11;
    const mmq_args args = {
        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,

@ -308,7 +308,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return false;
    }

    if (new_mma_available(cc) || amd_mfma_available(cc)) {
    if (new_mma_available(cc)) {
        return true;
    }

@ -324,5 +324,21 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

    if (amd_mfma_available(cc)) {
        // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
        // performs better but is currently suffering from a crash on this architecture.
        // TODO: Revisit when hipblaslt is fixed on CDNA3
        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
            return true;
        }
        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
            return true;
        }
        return false;
    }

    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

@ -252,25 +252,21 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
#endif // AMD_MFMA_AVAILABLE

#if defined(GGML_USE_HIP)
static int mmq_get_nwarps_host(const int cc) {
static int mmq_get_nwarps_host(const int cc, const int warp_size) {
    return amd_mfma_available(cc) ? 8 : 4;
    return amd_mfma_available(cc) ? 8 : 256/warp_size;
}
#else
static int mmq_get_nwarps_host(const int /*cc*/) {
static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
    return 8;
    return 256/warp_size;
}
#endif // (GGML_USE_HIP)

static constexpr __device__ int mmq_get_nwarps_device() {
#if defined(GGML_USE_HIP)
#if defined(AMD_MFMA_AVAILABLE)
    return 8;
#else
    return 4;
    return 256/ggml_cuda_get_physical_warp_size();
#endif // AMD_MFMA_AVAILABLE
#else
    return 8;
#endif // defined(GGML_USE_HIP)
}

// ------------------------------------------------------------

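The reworked `mmq_get_nwarps_host`/`mmq_get_nwarps_device` pair sizes the MMQ block at 256 threads on the non-MFMA paths, so the warp count now follows from the warp size instead of being hard-coded. A quick worked example (plain C++, illustrative only):

#include <cstdio>

int main() {
    // 256-thread blocks: warp count is derived from the warp (wavefront) size.
    const int warp_sizes[] = {32, 64}; // e.g. NVIDIA/RDNA vs. GCN/CDNA wavefronts
    for (int ws : warp_sizes) {
        printf("warp_size=%d -> nwarps=%d\n", ws, 256 / ws); // prints 8 and 4
    }
    return 0;
}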
@ -3097,8 +3093,8 @@ static __global__ void mul_mat_q(
    }
    __syncthreads();

    // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
    // On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
    {
        const int wt = blockIdx.z / nchannels_y;
        const int zt = blockIdx.z - wt*nchannels_y;

@ -3473,7 +3469,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
    const int cc        = ggml_cuda_info().devices[id].cc;
    const int nsm       = ggml_cuda_info().devices[id].nsm;
    const int warp_size = ggml_cuda_info().devices[id].warp_size;
    const int nwarps    = mmq_get_nwarps_host(cc);
    const int nwarps    = mmq_get_nwarps_host(cc, warp_size);
    const int mmq_y     = get_mmq_y_host(cc);

    const dim3 block_dims(warp_size, nwarps, 1);

@ -3560,7 +3556,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
    const int cc        = ggml_cuda_info().devices[id].cc;
    const size_t smpbo  = ggml_cuda_info().devices[id].smpbo;
    const int warp_size = ggml_cuda_info().devices[id].warp_size;
    const int nwarps    = mmq_get_nwarps_host(cc);
    const int nwarps    = mmq_get_nwarps_host(cc, warp_size);

    const int mmq_x_max = get_mmq_x_max_host(cc);
    const int mmq_y     = get_mmq_y_host(cc);

@ -70,3 +70,69 @@ kernel void kernel_div_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] / src1[idx1];
}

kernel void kernel_div_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst  + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) / *((global half *)(src1_ptr + i10*nb10));
    }
}

kernel void kernel_div_row_f16(
        global half4 * src0,
        ulong offset0,
        global half4 * src1,
        ulong offset1,
        global half4 * dst,
        ulong offsetd,
        int ne
) {
    src0 = (global half4*)((global char*)src0 + offset0);
    src1 = (global half4*)((global char*)src1 + offset1);
    dst  = (global half4*)((global char*)dst  + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] / src1[idx1];
}
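The `gid - (gid/ne)*ne` expression used in both row kernels is just an integer modulo written out with a divide and a multiply, which the in-kernel comment notes performs better than `%` here. A quick check of the identity (plain C++, illustrative only):

#include <cassert>
#include <cstdio>

int main() {
    const unsigned ne = 7;
    for (unsigned gid = 0; gid < 50; ++gid) {
        unsigned idx1 = gid - (gid / ne) * ne; // same value as gid % ne
        assert(idx1 == gid % ne);
    }
    printf("gid - (gid/ne)*ne == gid %% ne for all tested gid\n");
    return 0;
}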
132
ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl
Normal file

@ -0,0 +1,132 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define LOAD_VEC_A 4
#define LOAD_VEC_B 4

#define BM 64
#define BN 64
#define BK 16
#define TM 4
#define TN 8

kernel void kernel_mul_mm_f16_f32_l4_lm(
        global half4 * src0,
        ulong offset0,
        global float4 * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,

        int ne00,
        int ne01,
        int ne02,
        int ne11,
        int ne12,

        int stride_a,
        int stride_b,
        int stride_d,

        int batch_stride_a,
        int batch_stride_b,
        int batch_stride_d,

        int r2,
        int r3
) {
    src0 = (global half4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst  = (global float*)((global char*)dst + offsetd);

    local half  buf_a[BM * BK];
    local float buf_b[BN * BK];

    const int batch_idx = get_global_id(2);

    const int i13 = batch_idx / ne12;
    const int i12 = batch_idx % ne12;

    const int i03 = i13 / r3;
    const int i02 = i12 / r2;

    const int batch_idx_a = i03 * ne02 + i02;

    const int ir = get_group_id(0);
    const int ic = get_group_id(1);

    const int tid  = get_local_id(0);
    const int th_r = tid % (BM / TM);
    const int th_c = tid / (BM / TM);

    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);

    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;

    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
    int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;

    float sums[TM * TN];
    half  cache_a[TM];
    float cache_b[TN];

    for (int i = 0; i < TM * TN; i++) {
        sums[i] = 0.0f;
    }

    for (int block = 0; block < ne00; block += BK) {
        for (int l = 0; l < BM; l += loadstride_a) {
            const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
            buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
            buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
            buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
            buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
        }

        for (int l = 0; l < BN; l += loadstride_b) {
            const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
            buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
            buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
            buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
            buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        pos_a += BK / LOAD_VEC_A;
        pos_b += BK / LOAD_VEC_B;

        for (int i = 0; i < BK; i++) {
            for (int j = 0; j < TM; j++) {
                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
            }
            for (int j = 0; j < TN; j++) {
                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
            }

            for (int cc = 0; cc < TN; cc++) {
                for (int cr = 0; cr < TM; cr++) {
                    const int sums_idx = cc*TM + cr;
                    sums[sums_idx] = mad(convert_float(cache_a[cr]), cache_b[cc], sums[sums_idx]);
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    const int dr = ir * BM + th_r * TM;
    const int dc = ic * BN + th_c * TN;

    const int offsets = batch_idx * batch_stride_d;

    for (int cc = 0; cc < TN; cc++) {
        for (int cr = 0; cr < TM; cr++) {
|
||||||
|
if (dr + cr < ne01 && dc + cc < ne11) {
|
||||||
|
dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
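Quick orientation for the two new tiled-matmul kernels: each work-group owns a BM x BN tile of the output, streams BK-deep panels of both inputs through local memory, and each work-item accumulates a TM x TN register tile (with bounds checks on the final store). A minimal host-side sketch of the implied dispatch geometry (plain C++, illustrative only — the helper name and struct are ours, and the chosen local size must still satisfy the LOAD_VEC_A/B loader assumptions in the kernel):

#include <cstdint>

// Tile parameters copied from the kernels above.
constexpr uint32_t BM = 64, BN = 64;

// Hypothetical helper: how many work-groups a dispatch would need for an
// ne01 x ne11 output with ne12 * ne13 batches (integer ceiling division).
struct dispatch_dims { uint32_t groups_x, groups_y, groups_z; };

dispatch_dims mul_mm_dispatch(uint32_t ne01, uint32_t ne11, uint32_t ne12, uint32_t ne13) {
    return {
        (ne01 + BM - 1) / BM, // output rows  -> get_group_id(0)
        (ne11 + BN - 1) / BN, // output cols  -> get_group_id(1)
        ne12 * ne13,          // batch index  -> get_global_id(2)
    };
}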
ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl (new file, 133 lines)
@@ -0,0 +1,133 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define LOAD_VEC_A 4
#define LOAD_VEC_B 4

#define BM 64
#define BN 64
#define BK 16
#define TM 4
#define TN 8

kernel void kernel_mul_mm_f32_f32_l4_lm(
        global float4 * src0,
        ulong offset0,
        global float4 * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,

        int ne00,
        int ne01,
        int ne02,
        int ne11,
        int ne12,

        int stride_a,
        int stride_b,
        int stride_d,

        int batch_stride_a,
        int batch_stride_b,
        int batch_stride_d,

        int r2,
        int r3
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

    local float buf_a[BM * BK];
    local float buf_b[BN * BK];

    const int batch_idx = get_global_id(2);

    const int i13 = batch_idx / ne12;
    const int i12 = batch_idx % ne12;

    const int i03 = i13 / r3;
    const int i02 = i12 / r2;

    const int batch_idx_a = i03 * ne02 + i02;

    const int ir = get_group_id(0);
    const int ic = get_group_id(1);

    const int tid = get_local_id(0);
    const int th_r = tid % (BM / TM);
    const int th_c = tid / (BM / TM);

    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A);
    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A);
    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B);
    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B);

    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK;
    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK;

    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A;
    int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B;

    float sums[TM * TN];
    float cache_a[TM];
    float cache_b[TN];

    for (int i = 0; i < TM * TN; i++) {
        sums[i] = 0.0f;
    }

    for (int block = 0; block < ne00; block += BK) {
        for (int l = 0; l < BM; l += loadstride_a) {
            const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
            buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
            buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
            buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
            buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = src0[idx].s3;
        }

        for (int l = 0; l < BN; l += loadstride_b) {
            const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
            buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
            buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
            buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
            buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        pos_a += BK / LOAD_VEC_A;
        pos_b += BK / LOAD_VEC_B;

        for (int i = 0; i < BK; i++) {
            for (int j = 0; j < TM; j++) {
                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
            }

            for (int j = 0; j < TN; j++) {
                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
            }

            for (int cc = 0; cc < TN; cc++) {
                for (int cr = 0; cr < TM; cr++) {
                    const int sums_idx = cc*TM + cr;
                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    const int dr = ir * BM + th_r * TM;
    const int dc = ic * BN + th_c * TN;

    const int offsets = batch_idx * batch_stride_d;

    for (int cc = 0; cc < TN; cc++) {
        for (int cr = 0; cr < TM; cr++) {
            if (dr + cr < ne01 && dc + cc < ne11) {
                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
            }
        }
    }
}
@@ -70,3 +70,69 @@ kernel void kernel_sub_row(
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] - src1[idx1];
}
+
+kernel void kernel_sub_f16(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) - *((global half *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_sub_row_f16(
+        global half4 * src0,
+        ulong offset0,
+        global half4 * src1,
+        ulong offset1,
+        global half4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    src1 = (global half4*)((global char*)src1 + offset1);
+    dst = (global half4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] - src1[idx1];
+}
+
@@ -1357,7 +1357,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        vk::DebugUtilsObjectNameInfoEXT duoni;
        duoni.objectType = vk::ObjectType::ePipeline;
        duoni.pObjectName = pipeline->name.c_str();
-        duoni.objectHandle = reinterpret_cast<uint64_t>(static_cast<VkPipeline_T*>(pipeline->pipeline));
+        duoni.objectHandle = /*reinterpret_cast*/(uint64_t)(static_cast<VkPipeline>(pipeline->pipeline));
        vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
    }

@@ -5249,9 +5249,9 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
}

static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
    std::cerr << "), " << (dryrun ? "dryrun" : "") << ")");
    GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16);  // NOLINT
    GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);  // NOLINT

@@ -11192,7 +11192,7 @@ size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
    ggml_tensor * tensor = cgraph->nodes[tensor_idx];
-    if (tensor->op == GGML_OP_TRANSPOSE) {
+    if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
        return;
    }

@@ -11312,7 +11312,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
    } else if (tensor->op == GGML_OP_SCALE) {
        const float * params = (const float *)tensor->op_params;
-        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
+        tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
    } else if (tensor->op == GGML_OP_SQR) {
        tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
    } else if (tensor->op == GGML_OP_SIN) {

@@ -11423,8 +11423,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
        } else {
            tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
        }
-    } else if (tensor->op == GGML_OP_SET_ROWS) {
-        tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]);
    } else if (tensor->op == GGML_OP_CONT) {
        tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_RESHAPE) {

@@ -11532,7 +11530,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *

static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
    ggml_tensor * tensor = cgraph->nodes[tensor_idx];
-    if (tensor->op == GGML_OP_TRANSPOSE) {
+    if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
        return;
    }
    bool fused_rms_norm_mul = false;

@@ -11592,6 +11590,9 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph *
                } else if (tensor->type == GGML_TYPE_F16) {
                    correct = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
                    result = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
+                } else if (tensor->type == GGML_TYPE_BF16) {
+                    correct = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]));
+                    result = ggml_bf16_to_fp32(*(ggml_bf16_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
                } else if (tensor->type == GGML_TYPE_I32) {
                    correct = *(int32_t *) ((char *) comp_result + i3*comp_nb[3] + i2*comp_nb[2] + i1*comp_nb[1] + i0*comp_nb[0]);
                    result = *(int32_t *) ((char *) tensor_data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
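One detail worth noting in the checker change above: GGML_OP_SCALE is now cloned as ggml_scale_bias, i.e. the Vulkan scale op is validated against an elementwise y = x * s + b reference (op_params[0] = scale, op_params[1] = bias) rather than plain y = x * s. A minimal sketch of that reference math (plain C++, illustrative only — the function name and flat-vector layout are ours, not the project's):

#include <vector>

// Reference the checker now expects for GGML_OP_SCALE: scale then add bias.
std::vector<float> scale_bias_ref(const std::vector<float> & x, float s, float b) {
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * s + b;
    }
    return y;
}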
@@ -279,6 +279,9 @@ class Keys:
        class Projector:
            STACK_FACTOR = "clip.audio.projector.stack_factor"

+    class Diffusion:
+        SHIFT_LOGITS = "diffusion.shift_logits"
+
#
# recommended mapping of model tensor names for storage in gguf
#

@@ -373,10 +376,12 @@ class MODEL_ARCH(IntEnum):
    ERNIE4_5 = auto()
    ERNIE4_5_MOE = auto()
    HUNYUAN_MOE = auto()
+    HUNYUAN_DENSE = auto()
    SMOLLM3 = auto()
    LFM2 = auto()
    DREAM = auto()
    SMALLTHINKER = auto()
+    LLADA = auto()


class VISION_PROJECTOR_TYPE(IntEnum):

@@ -693,10 +698,12 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
    MODEL_ARCH.FALCON_H1: "falcon-h1",
    MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
+    MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
    MODEL_ARCH.SMOLLM3: "smollm3",
    MODEL_ARCH.LFM2: "lfm2",
    MODEL_ARCH.DREAM: "dream",
    MODEL_ARCH.SMALLTHINKER: "smallthinker",
+    MODEL_ARCH.LLADA: "llada",
}

VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -1318,6 +1325,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.LLADA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.QWEN2VL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,

@@ -2451,6 +2473,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_SHEXP,
        MODEL_TENSOR.FFN_UP_SHEXP,
    ],
+    MODEL_ARCH.HUNYUAN_DENSE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.SMOLLM3: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -1047,6 +1047,11 @@ class GGUFWriter:
    def add_audio_stack_factor(self, value: int) -> None:
        self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)

+    # diffusion models
+
+    def add_diffusion_shift_logits(self, value: bool) -> None:
+        self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value)
+
    def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
        pack_prefix = ''
        if not skip_pack_prefix:
@@ -32,6 +32,7 @@ class TensorNameMap:
            "model.word_embeddings", # bailingmoe
            "language_model.model.embed_tokens", # llama4
            "encoder", # neobert
+            "model.transformer.wte", # llada
        ),

        # Token type embeddings

@@ -71,6 +72,7 @@ class TensorNameMap:
            "head", # rwkv
            "head.out", # wavtokenizer
            "lm_head", # llama4
+            "model.transformer.ff_out", # llada
        ),

        # Output norm

@@ -94,6 +96,7 @@ class TensorNameMap:
            "model.ln_out", # rwkv7
            "backbone.final_layer_norm", # wavtokenizer
            "model.norm", # llama4
+            "model.transformer.ln_f", # llada
        ),

        # Rope frequencies

@@ -139,6 +142,7 @@ class TensorNameMap:
            "model.layers.{bid}.input_layernorm", # llama4
            "transformer_encoder.{bid}.attention_norm", # neobert
            "model.layers.{bid}.operator_norm", # lfm2
+            "model.transformer.blocks.{bid}.attn_norm", # llada
        ),

        # Attention norm 2

@@ -183,6 +187,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
            "transformer.h.{bid}.attn.attention.q_proj", # exaone
            "model.layers.{bid}.self_attn.q_proj", # llama4
+            "model.transformer.blocks.{bid}.q_proj", # llada
        ),

        # Attention key

@@ -199,6 +204,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
            "transformer.h.{bid}.attn.attention.k_proj", # exaone
            "model.layers.{bid}.self_attn.k_proj", # llama4
+            "model.transformer.blocks.{bid}.k_proj", # llada
        ),

        # Attention value

@@ -214,6 +220,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
            "transformer.h.{bid}.attn.attention.v_proj", # exaone
            "model.layers.{bid}.self_attn.v_proj", # llama4
+            "model.transformer.blocks.{bid}.v_proj", # llada
        ),

        # Attention output

@@ -246,6 +253,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.attention.out_proj", # exaone
            "model.layers.{bid}.self_attn.o_proj", # llama4
            "transformer_encoder.{bid}.wo", # neobert
+            "model.transformer.blocks.{bid}.attn_out", # llada
        ),

        # Attention output norm

@@ -291,6 +299,7 @@ class TensorNameMap:
            "model.layers.{bid}.post_attention_layernorm", # llama4
            "transformer_encoder.{bid}.ffn_norm", # neobert
            "model.layers.layers.{bid}.pre_mlp_norm", # plamo2
+            "model.transformer.blocks.{bid}.ff_norm", # llada
        ),

        # Post feed-forward norm

@@ -364,6 +373,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
            "transformer_encoder.{bid}.ffn.w12", # neobert
            "model.layers.{bid}.block_sparse_moe.up", # smallthinker
+            "model.transformer.blocks.{bid}.up_proj", # llada
        ),

        MODEL_TENSOR.FFN_UP_EXP: (

@@ -405,6 +415,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.c_fc_0", # exaone
            "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
            "model.layers.{bid}.block_sparse_moe.gate", # smallthinker
+            "model.transformer.blocks.{bid}.ff_proj", # llada
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (

@@ -454,6 +465,7 @@ class TensorNameMap:
            "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
            "transformer_encoder.{bid}.ffn.w3", # neobert
            "model.layers.{bid}.block_sparse_moe.down", # smallthinker
+            "model.transformer.blocks.{bid}.ff_out", # llada
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (

@@ -604,6 +616,7 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.SSM_DT_NORM: (
+            "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
            "model.layers.{bid}.mamba.dt_layernorm", # jamba
        ),


@@ -633,10 +646,6 @@ class TensorNameMap:
            "model.layers.layers.{bid}.mixer.D", # plamo2
        ),

-        MODEL_TENSOR.SSM_DT_NORM: (
-            "model.layers.layers.{bid}.mixer.dt_norm.weight", # plamo2
-        ),
-
        MODEL_TENSOR.SSM_NORM: (
            "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
            "backbone.layers.{bid}.mixer.norm", # mamba2
@@ -291,6 +291,7 @@ extern "C" {
        bool use_mmap;      // use mmap if possible
        bool use_mlock;     // force system to keep model in RAM
        bool check_tensors; // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -540,6 +541,9 @@ extern "C" {
    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

+    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+
    // Returns 0 on success
    LLAMA_API uint32_t llama_model_quantize(
            const char * fname_inp,
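The two llama.h additions above are small API surface changes: llama_model_params grows a use_extra_bufts flag and there is a new llama_model_is_diffusion() query. A minimal caller-side sketch under the usual loading entry points (illustrative only — the helper, the model path, and the exact choice of flag value are ours):

#include "llama.h"

// Hypothetical snippet: load a model without extra CPU repacking buffers and
// ask whether it needs a diffusion-style decoding loop.
bool load_and_probe(const char * model_path /* placeholder */) {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // opt out of extra (weight-repacking) buffer types

    llama_model * model = llama_model_load_from_file(model_path, mparams);
    if (model == nullptr) {
        return false;
    }

    // Diffusion models (LLaDA, Dream, ...) are decoded differently from
    // autoregressive ones, so callers can branch on this query.
    const bool is_diffusion = llama_model_is_diffusion(model);

    llama_model_free(model);
    return is_diffusion;
}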
@@ -85,10 +85,12 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_ERNIE4_5,      "ernie4_5"      },
    { LLM_ARCH_ERNIE4_5_MOE,  "ernie4_5-moe"  },
    { LLM_ARCH_HUNYUAN_MOE,   "hunyuan-moe"   },
+    { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
    { LLM_ARCH_SMOLLM3,       "smollm3"       },
    { LLM_ARCH_LFM2,          "lfm2"          },
    { LLM_ARCH_DREAM,         "dream"         },
    { LLM_ARCH_SMALLTHINKER,  "smallthinker"  },
+    { LLM_ARCH_LLADA,         "llada"         },
    { LLM_ARCH_UNKNOWN,       "(unknown)"     },
};

@@ -1896,6 +1898,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
        },
    },
+    {
+        LLM_ARCH_HUNYUAN_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+
+        },
+    },
    {
        LLM_ARCH_SMOLLM3,
        {

@@ -1972,6 +1994,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_LLADA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_UNKNOWN,
        {

@@ -2224,6 +2263,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
bool llm_arch_is_diffusion(const llm_arch & arch) {
    switch (arch) {
        case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
            return true;
        default:
            return false;
@@ -89,10 +89,12 @@ enum llm_arch {
    LLM_ARCH_ERNIE4_5,
    LLM_ARCH_ERNIE4_5_MOE,
    LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_DENSE,
    LLM_ARCH_SMOLLM3,
    LLM_ARCH_LFM2,
    LLM_ARCH_DREAM,
    LLM_ARCH_SMALLTHINKER,
+    LLM_ARCH_LLADA,
    LLM_ARCH_UNKNOWN,
};

@@ -66,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "llama4",        LLM_CHAT_TEMPLATE_LLAMA4 },
    { "smolvlm",       LLM_CHAT_TEMPLATE_SMOLVLM },
    { "hunyuan-moe",   LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
    { "kimi-k2",       LLM_CHAT_TEMPLATE_KIMI_K2 },
};

@@ -193,6 +194,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_DOTS1;
    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
+    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
        return LLM_CHAT_TEMPLATE_KIMI_K2;
    }

@@ -698,11 +701,27 @@ int32_t llm_chat_apply_template(
            if (role == "system") {
                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
            } else if (role == "assistant") {
-                ss << "<|startoftext|>" << message->content << "<|eos|>";
+                ss << message->content << "<|eos|>";
            } else {
                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
            }
        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
+        // tencent/Hunyuan-4B-Instruct
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0) {
+                if (role == "system") {
+                    ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                }
+            }
+
+            if (role == "assistant") {
+                ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
+            } else if (role == "user") {
+                ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
        // moonshotai/Kimi-K2-Instruct
        for (auto message : chat) {
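For orientation, this is the string the new hunyuan-dense branch above produces for a short system+user exchange; it is just the concatenation implied by the code, shown for a hypothetical two-message chat rather than captured from the library:

#include <string>

// Expected rendering per the hunyuan-dense branch above, for:
//   { role: "system", content: "You are helpful." }
//   { role: "user",   content: "Hi" }
// The first (system) message is emitted bare and closed with the no-3
// placeholder token; the user turn is wrapped in <|hy_User|> ... <|hy_Assistant|>,
// leaving the prompt positioned for the assistant reply.
const std::string expected =
    "You are helpful.<|hy_place▁holder▁no▁3|>"
    "<|hy_User|>Hi<|hy_Assistant|>";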
@@ -46,6 +46,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_DOTS1,
    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_UNKNOWN,
};
@@ -113,6 +113,15 @@ llama_context::llama_context(
        }
    }

+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
+
    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

    LLAMA_LOG_INFO("%s: n_seq_max     = %u\n", __func__, cparams.n_seq_max);

@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
    // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
    const auto gparams = graph_params(res, ubatch, mctx, gtype);

-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
        //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);

        n_reused++;
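The new toggle above is read once, in the llama_context constructor, so it can be exercised without rebuilding. A minimal sketch of flipping it from a host program before the context is created (illustrative only — the helper is ours; the environment call is POSIX setenv, with _putenv_s being the usual Windows equivalent):

#include <cstdlib>

// Hypothetical helper: disable graph reuse for debugging or benchmarking.
// Must run before the llama_context is constructed, since the flag is read there.
void disable_graph_reuse() {
    setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", /*overwrite=*/1);
}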
@@ -291,6 +291,9 @@ private:
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
    bool supports_set_rows = false;

+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
+
    // perf
    mutable int64_t t_start_us = 0;
    mutable int64_t t_load_us  = 0;
@@ -785,13 +785,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        bool scale_w,
        float w_scale,
        llama_expert_gating_func_type gating_op,
-        int il) const {
+        int il,
+        ggml_tensor * probs_in) const {
    const int64_t n_embd   = cur->ne[0];
    const int64_t n_tokens = cur->ne[1];
    const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN

-    ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+    ggml_tensor * logits = nullptr;
+
+    if (probs_in == nullptr) {
+        logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
    cb(logits, "ffn_moe_logits", il);
+    } else {
+        logits = probs_in;
+    }

    ggml_tensor * probs = nullptr;
    switch (gating_op) {

@@ -884,6 +891,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
                cur = ggml_gelu(ctx0, cur);
                cb(cur, "ffn_moe_gelu", il);
            } break;
+        case LLM_FFN_RELU:
+            if (gate_exps) {
+                cur = ggml_reglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_reglu", il);
+            } else {
+                cur = ggml_relu(ctx0, cur);
+                cb(cur, "ffn_moe_relu", il);
+            } break;
        default:
            GGML_ABORT("fatal error");
    }

@@ -927,100 +942,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
    return moe_out;
}

-ggml_tensor * llm_graph_context::build_moe_ffn_from_probs(
-        ggml_tensor * cur,
-        ggml_tensor * probs,
-        ggml_tensor * up_exps,
-        ggml_tensor * gate_exps,
-        ggml_tensor * down_exps,
-        ggml_tensor * exp_probs_b,
-        int64_t n_expert,
-        int64_t n_expert_used,
-        llama_expert_gating_func_type gating_op,
-        int il) const {
-    const int64_t n_embd   = cur->ne[0];
-    const int64_t n_tokens = cur->ne[1];
-
-    // add experts selection bias - introduced in DeepSeek V3
-    // leave probs unbiased as it's later used to get expert weights
-    ggml_tensor * selection_probs = probs;
-    if (exp_probs_b != nullptr) {
-        selection_probs = ggml_add(ctx0, probs, exp_probs_b);
-        cb(selection_probs, "ffn_moe_probs_biased", il);
-    }
-
-    // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
-    cb(selected_experts, "ffn_moe_topk", il);
-
-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-    cb(weights, "ffn_moe_weights", il);
-
-    weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
-    if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) {
-        weights = ggml_soft_max(ctx0, weights);
-    } else {
-        weights = ggml_sigmoid(ctx0, weights);
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
-        cb(weights_sum, "ffn_moe_weights_sum", il);
-
-        weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
-        cb(weights, "ffn_moe_weights_norm", il);
-    }
-
-    weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
-
-    cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
-
-    ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(up, "ffn_moe_up", il);
-
-    ggml_tensor * experts = nullptr;
-    cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(cur, "ffn_moe_gate", il);
-
-    cur = ggml_reglu_split(ctx0, cur, up);
-    cb(cur, "ffn_moe_reglu", il);
-
-    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
-    cb(experts, "ffn_moe_down", il);
-
-    experts = ggml_mul(ctx0, experts, weights);
-    cb(cur, "ffn_moe_weighted", il);
-
-    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
-
-    assert(n_expert_used > 0);
-
-    // order the views before the adds
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
-
-        ggml_build_forward_expand(gf, cur_experts[i]);
-    }
-
-    // aggregate experts
-    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
-    // to avoid potentially a large number of add nodes during warmup
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = cur_experts[0];
-
-    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
-        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
-    }
-
-    if (n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx0, moe_out);
-    }
-
-    cb(moe_out, "ffn_moe_out", il);
-
-    return moe_out;
-}
-
// input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
    const int64_t n_embd = hparams.n_embd;

@@ -1644,16 +1565,17 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif

ggml_tensor * llm_graph_context::build_rs(
        ggml_tensor * s,
-        ggml_tensor * state_copy,
+        ggml_tensor * state_copy_main,
+        ggml_tensor * state_copy_extra,
        int32_t state_size,
        int32_t n_seqs,
-        uint32_t n_kv,
-        uint32_t kv_head,
-        uint32_t kv_size,
+        uint32_t n_rs,
+        uint32_t rs_head,
+        uint32_t rs_size,
        int32_t rs_zero,
        const llm_graph_get_rows_fn & get_state_rows) const {

-    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_size);
+    ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);

    // Clear a single state which will then be copied to the other cleared states.
    // Note that this is a no-op when the view is zero-sized.

@@ -1661,39 +1583,44 @@ ggml_tensor * llm_graph_context::build_rs(
    ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));

    // copy states
-    // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
-    // {state_size, kv_size} -> {state_size, n_seqs}
-    ggml_tensor * output_states = get_state_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
+    // NOTE: assuming the copy destinations are ALL contained between rs_head and rs_head + n_rs
+    // {state_size, rs_size} -> {state_size, n_seqs}
+    ggml_tensor * output_states = get_state_rows(ctx0, states, state_copy_main);
    ggml_build_forward_expand(gf, output_states);

-    // copy extra states which won't be changed further (between n_seqs and n_kv)
-    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
+    // copy extra states which won't be changed further (between n_seqs and n_rs)
+    ggml_tensor * states_extra = ggml_get_rows(ctx0, states, state_copy_extra);
    ggml_build_forward_expand(gf,
        ggml_cpy(ctx0,
            states_extra,
-            ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
+            ggml_view_1d(ctx0, s, state_size*(n_rs - n_seqs), (rs_head + n_seqs)*state_size*ggml_element_size(s))));

    return output_states;
}

static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
        ggml_context * ctx0,
+        const llama_ubatch & ubatch,
        const llama_memory_recurrent_context * mctx_cur) {

    auto inp = std::make_unique<llm_graph_input_rs>(mctx_cur);

-    const auto n_rs = mctx_cur->get_n_rs();
+    const int64_t n_rs   = mctx_cur->get_n_rs();
+    const int64_t n_seqs = ubatch.n_seqs;

    inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_rs);
    ggml_set_input(inp->s_copy);

+    inp->s_copy_main  = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
+    inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
+
    return inp;
}

llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
    const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);

-    auto inp = build_rs_inp_impl(ctx0, mctx_cur);
+    auto inp = build_rs_inp_impl(ctx0, ubatch, mctx_cur);

    return (llm_graph_input_rs *) res->add_input(std::move(inp));
}

@@ -1706,7 +1633,9 @@ ggml_tensor * llm_graph_context::build_rs(
        const llm_graph_get_rows_fn & get_state_rows) const {
    const auto * kv_state = inp->mctx;

-    return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
+    return build_rs(s, inp->s_copy_main, inp->s_copy_extra, state_size, n_seqs,
+            kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(),
+            get_state_rows);
}

ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(

@@ -1753,7 +1682,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
    const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);

-    auto inp_rs   = build_rs_inp_impl(ctx0, mctx_cur->get_recr());
+    auto inp_rs   = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
    auto inp_attn = build_attn_inp_kv_unified_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());

    auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
@@ -214,7 +214,12 @@ public:
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy; // I32 [kv_size]
+    ggml_tensor * s_copy; // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
@@ -418,7 +423,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 // therefore we cannot perform the sequence id check. normally should never happen
@@ -626,19 +633,8 @@ struct llm_graph_context {
             bool scale_w,
             float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
-
-    ggml_tensor * build_moe_ffn_from_probs(
-            ggml_tensor * cur,
-            ggml_tensor * probs,
-            ggml_tensor * up_exps,
-            ggml_tensor * gate_exps,
-            ggml_tensor * down_exps,
-            ggml_tensor * exp_probs_b,
-            int64_t n_expert,
-            int64_t n_expert_used,
-            llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
@@ -730,7 +726,6 @@ struct llm_graph_context {
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     //       this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -738,12 +733,13 @@ struct llm_graph_context {
     //       `llama_memory_recurrent`
     ggml_tensor * build_rs(
             ggml_tensor * s,
-            ggml_tensor * state_copy,
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
             int32_t state_size,
             int32_t n_seqs,
-            uint32_t n_kv,
-            uint32_t kv_head,
-            uint32_t kv_size,
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
             int32_t rs_zero,
             const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
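Aside (not part of the patch): a minimal sketch of how the two views described in the header comment above could be taken from the single s_copy tensor with the existing ggml view API, assuming a live ggml_context `ctx` and n_rs >= n_seqs; the helper name is hypothetical.

#include "ggml.h"

// illustration only: carve the "main" and "extra" sub-ranges out of one I32 copy tensor
static void make_s_copy_views(ggml_context * ctx, int64_t n_rs, int64_t n_seqs) {
    ggml_tensor * s_copy       = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_rs);
    // first n_seqs entries, shared by the layers that call build_rs
    ggml_tensor * s_copy_main  = ggml_view_1d(ctx, s_copy, n_seqs, 0);
    // remaining n_rs - n_seqs entries
    ggml_tensor * s_copy_extra = ggml_view_1d(ctx, s_copy, n_rs - n_seqs,
                                              n_seqs * ggml_element_size(s_copy));
    (void) s_copy_main; (void) s_copy_extra;
}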
@@ -295,7 +295,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -324,8 +324,8 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    // add extra buffer types
+    if (use_extra_bufts) {
         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         if (cpu_dev == nullptr) {
             throw std::runtime_error(format("%s: no CPU backend found", __func__));
@@ -341,6 +341,7 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
                 ++extra_bufts;
             }
         }
+    }
 
     // add the CPU buffer type
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
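Aside (not from the patch): a hedged usage sketch from the application side, assuming the flag is exposed as `use_extra_bufts` on `llama_model_params`, as the default-params hunk further down suggests; "model.gguf" is a placeholder path.

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // skip the repacked "extra" CPU buffer types
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    // ... use the model ...
    llama_model_free(model);
    return 0;
}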
@@ -874,6 +875,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.causal_attn = false;
                 }
                 break;
+            case LLM_ARCH_LLADA:
+                {
+                    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                    // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+                    switch (hparams.n_layer) {
+                        case 32:
+                            type = LLM_TYPE_8B;
+                            break;
+                        default:
+                            type = LLM_TYPE_UNKNOWN;
+                    }
+                    // Set non-causal attention for diffusion models
+                    hparams.causal_attn = false;
+                }
+                break;
             case LLM_ARCH_QWEN2MOE:
                 {
                     ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -1749,6 +1765,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_embd) {
+                    case 1024: type = LLM_TYPE_0_5B; break;
+                    case 2048: type = LLM_TYPE_1_8B; break;
+                    case 3072: type = LLM_TYPE_4B; break;
+                    case 4096: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_SMOLLM3:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1829,7 +1857,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2045,7 +2073,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
             std::regex pattern(overrides->pattern);
             if (std::regex_search(tensor_name, pattern)) {
+                if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                    // when overriding to a CPU buffer, consider the extra buffer types
+                    buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                } else {
                 buft = overrides->buft;
+                }
+
                 LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                     tensor_name.c_str(),
                     ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@ -2207,6 +2241,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_LLADA:
|
||||||
|
{
|
||||||
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||||
|
|
||||||
|
// output
|
||||||
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
||||||
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
|
// if output is NULL, init from the input tok embed
|
||||||
|
if (output == NULL) {
|
||||||
|
output =
|
||||||
|
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||||
|
|
||||||
|
// Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
|
||||||
|
layer.wq =
|
||||||
|
create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
|
||||||
|
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
|
||||||
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
|
||||||
|
// No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
|
||||||
|
layer.wo =
|
||||||
|
create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||||
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
|
||||||
|
|
||||||
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
|
||||||
|
TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||||
|
|
||||||
|
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
|
||||||
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
|
||||||
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
|
||||||
|
|
||||||
|
// optional MLP bias
|
||||||
|
layer.ffn_gate_b =
|
||||||
|
create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
|
||||||
|
layer.ffn_down_b =
|
||||||
|
create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||||
|
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case LLM_ARCH_LLAMA4:
|
case LLM_ARCH_LLAMA4:
|
||||||
{
|
{
|
||||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||||
|
@@ -5222,6 +5303,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_HUNYUAN_DENSE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+                    }
+                } break;
             case LLM_ARCH_SMOLLM3:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8142,6 +8256,106 @@ struct llm_build_dream : public llm_graph_context {
     }
 };
 
+struct llm_build_llada : public llm_graph_context {
+    llm_build_llada(const llama_model & model, const llm_graph_params & params) :
+        llm_graph_context(params) {
+        // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // Non-causal attention for diffusion
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
+                                 1.0f / sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
+                            model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_qwen2vl : public llm_graph_context {
     llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -16861,6 +17075,144 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
     }
 };
 
+struct llm_build_hunyuan_dense : public llm_graph_context {
+    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_norm", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_norm", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+            // feed-forward network (non-MoE)
+            ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_mlp, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_smollm3 : public llm_graph_context {
     llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17258,10 +17610,18 @@ struct llm_build_smallthinker : public llm_graph_context{
             cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
-            ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps,
-                                                             model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
-                                                             nullptr, n_expert, n_expert_used,
-                                                             static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+            ggml_tensor * ffn_out =
+                build_moe_ffn(cur,
+                        nullptr,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_RELU, true,
+                        false, 0.0,
+                        static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
+                        il, probs);
 
             cb(ffn_out, "ffn_out", il);
             cur = ffn_out;
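With the signature change shown earlier, `probs_in` defaults to nullptr, in which case build_moe_ffn keeps computing the gating probabilities itself; SmallThinker instead forwards its precomputed `probs` through the new trailing argument, which replaces the removed build_moe_ffn_from_probs() helper. The parameter-name comments in this re-annotated copy of the call are guesses for illustration, not taken from the header.

ggml_tensor * ffn_out = build_moe_ffn(cur,
        /* gating input      */ nullptr,
        model.layers[il].ffn_up_exps,
        model.layers[il].ffn_gate_exps,
        model.layers[il].ffn_down_exps,
        /* expert prob bias   */ nullptr,
        n_expert, n_expert_used,
        LLM_FFN_RELU, /* normalize weights */ true,
        /* scale_w */ false, /* w_scale */ 0.0,
        static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
        il, /* probs_in */ probs);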
@@ -17301,6 +17661,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
             {
                 res = nullptr;
             } break;
@@ -17467,6 +17828,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_dream>(*this, params);
             }
             break;
+        case LLM_ARCH_LLADA:
+            {
+                llm = std::make_unique<llm_build_llada>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17714,6 +18080,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
             } break;
+        case LLM_ARCH_HUNYUAN_DENSE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
+            } break;
         case LLM_ARCH_SMOLLM3:
             {
                 llm = std::make_unique<llm_build_smollm3>(*this, params);
@@ -17763,6 +18133,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap        =*/ true,
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
+        /*.use_extra_bufts =*/ true,
     };
 
 #ifdef GGML_USE_METAL
@@ -17865,6 +18236,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLADA:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
@@ -17931,6 +18303,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_DOTS1:
         case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_HUNYUAN_DENSE:
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -18043,6 +18416,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
     return llm_arch_is_recurrent(model->arch);
 }
 
+bool llama_model_is_diffusion(const llama_model * model) {
+    return llm_arch_is_diffusion(model->arch);
+}
+
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
     return model->tensors_by_name;
 }
@@ -878,9 +878,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         // get more optimal quantization type based on the tensor shape, layer, etc.
         if (!params->pure && ggml_is_quantized(default_type)) {
+            int fallback = qs.n_fallback;
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-            // unless the user specifies a type
-            if (params->tensor_types) {
+            // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+            if (params->tensor_types && qs.n_fallback - fallback == 0) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                 const std::string tensor_name(tensor->name);
                 for (const auto & [tname, qtype] : tensor_types) {
@@ -893,7 +894,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 }
             }
         }
-
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
@@ -532,6 +532,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                 regex_exprs = {
                     "\\p{N}{1,3}",
                     "[一-龥-ゟ゠-ヿ]+",
@@ -2200,6 +2201,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "hunyuan") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "hunyuan-dense") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "kimi-k2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_SEED_CODER    = 35,
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN       = 36,
     LLAMA_VOCAB_PRE_TYPE_KIMI_K2       = 37,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
 };
 
 struct LLM_KV;
@@ -893,10 +893,16 @@ struct clip_graph {
         int n_head = n_embd/d_head;
         int num_query = 96;
         if (ctx->model.hparams.minicpmv_version == 2) {
+            // MiniCPM-V 2.5
             num_query = 96;
         } else if (ctx->model.hparams.minicpmv_version == 3) {
+            // MiniCPM-V 2.6
             num_query = 64;
         } else if (ctx->model.hparams.minicpmv_version == 4) {
+            // MiniCPM-o 2.6
+            num_query = 64;
+        } else if (ctx->model.hparams.minicpmv_version == 5) {
+            // MiniCPM-V 4.0
             num_query = 64;
         }
 
@@ -3727,10 +3733,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_MINICPMV:
             {
                 if (params.minicpmv_version == 2) {
+                    // MiniCPM-V 2.5
                     n_patches_sq = 96;
                 } else if (params.minicpmv_version == 3) {
+                    // MiniCPM-V 2.6
                     n_patches_sq = 64;
                 } else if (params.minicpmv_version == 4) {
+                    // MiniCPM-o 2.6
+                    n_patches_sq = 64;
+                } else if (params.minicpmv_version == 5) {
+                    // MiniCPM-V 4.0
                     n_patches_sq = 64;
                 } else {
                     GGML_ABORT("Unknown minicpmv version");
@@ -4459,11 +4471,17 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_3_b->ne[0];
         case PROJECTOR_TYPE_MINICPMV:
             if (hparams.minicpmv_version == 2) {
+                // MiniCPM-V 2.5
                 return 4096;
             } else if (hparams.minicpmv_version == 3) {
+                // MiniCPM-V 2.6
                 return 3584;
             } else if (hparams.minicpmv_version == 4) {
+                // MiniCPM-o 2.6
                 return 3584;
+            } else if (hparams.minicpmv_version == 5) {
+                // MiniCPM-V 4.0
+                return 2560;
             }
             GGML_ABORT("Unknown minicpmv version");
         case PROJECTOR_TYPE_GLM_EDGE:
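As a summary (not part of the patch), the per-version constants used in the three hunks above can be collected into one hypothetical table; version 6 (MiniCPM-o 4.0) is mentioned only in the converter help text further down and is therefore left out here.

#include <map>
#include <utility>

// minicpmv_version -> { num_query / n_patches_sq, clip_n_mmproj_embd }
static const std::map<int, std::pair<int, int>> minicpmv_dims = {
    { 2, { 96, 4096 } }, // MiniCPM-V 2.5
    { 3, { 64, 3584 } }, // MiniCPM-V 2.6
    { 4, { 64, 3584 } }, // MiniCPM-o 2.6
    { 5, { 64, 2560 } }, // MiniCPM-V 4.0
};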
@@ -497,11 +497,11 @@ ap.add_argument("--projector-type", help="Type of projector. Possible values: ml
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
-default_image_mean = [0.48145466, 0.4578275, 0.40821073]
-default_image_std = [0.26862954, 0.26130258, 0.27577711]
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
-ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6', default=2)
 
 # with proper
 args = ap.parse_args()
@@ -517,6 +517,17 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
+# If minicpmv_projector is not specified but the default path exists, use the default path
+if args.minicpmv_projector is None:
+    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
+    if os.path.isfile(default_projector_path):
+        args.minicpmv_projector = default_projector_path
+        print(f"Found default projector file: {default_projector_path}")
+
+# If output_dir is not specified, use model_dir as the default value
+if args.output_dir is None:
+    args.output_dir = dir_model
+
 if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
     vocab = None
     tokens = None
@@ -546,18 +557,21 @@ if args.use_f32:
 minicpmv_version = args.minicpmv_version
 emb_dim = 4096
 block_count = 26
-if minicpmv_version == 1:
+if minicpmv_version == 1:    # MiniCPM-V 2.0
     emb_dim = 2304
     block_count = 26
-elif minicpmv_version == 2:
+elif minicpmv_version == 2:  # MiniCPM-V 2.5
     emb_dim = 4096
     block_count = 27
-elif minicpmv_version == 3:
+elif minicpmv_version == 3:  # MiniCPM-V 2.6
     emb_dim = 3584
     block_count = 27
-elif minicpmv_version == 4:
+elif minicpmv_version == 4:  # MiniCPM-o 2.6
     emb_dim = 3584
     block_count = 27
+elif minicpmv_version == 5:  # MiniCPM-V 4.0
+    emb_dim = 2560
+    block_count = 27
 
 default_vision_config = {
         "hidden_size": 1152,
@@ -577,6 +591,10 @@ if minicpmv_version == 3:
 elif minicpmv_version == 4:
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 5:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
 
 processor = None
 # if model.attn_pool is not None:
@@ -603,7 +621,7 @@ elif args.vision_only:
 else:
     fname_middle = ""
 
-output_dir = args.output_dir if args.output_dir is not None else dir_model
+output_dir = args.output_dir
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
@@ -207,7 +207,7 @@ struct mtmd_context {
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
 
-        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
             // minicpmv 2.6 format:
             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
             slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
@@ -312,7 +312,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::strin
         int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx);
         imatrix_datasets.reserve(n_datasets);
         for (int64_t i = 0; i < n_datasets; ++i) {
-            imatrix_datasets.push_back(gguf_get_val_str(ctx_gguf, dataset_idx));
+            imatrix_datasets.push_back(gguf_get_arr_str(ctx_gguf, dataset_idx, i));
         }
         printf("%s: imatrix datasets=['%s'", __func__, imatrix_datasets[0].c_str());
         for (size_t i = 1; i < imatrix_datasets.size(); ++i) {
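The fix above swaps a scalar accessor for the per-element one. A minimal sketch of the corrected read loop, using only GGUF calls already present in this hunk; the wrapper name and the "gguf.h" header are assumptions, and `ctx_gguf` / `dataset_idx` are taken to be set up as above.

#include "gguf.h"
#include <string>
#include <vector>

static std::vector<std::string> read_datasets(gguf_context * ctx_gguf, int64_t dataset_idx) {
    const int64_t n_datasets = gguf_get_arr_n(ctx_gguf, dataset_idx);
    std::vector<std::string> datasets;
    datasets.reserve(n_datasets);
    for (int64_t i = 0; i < n_datasets; ++i) {
        // element i of the string array, not the (scalar) value of the key
        datasets.push_back(gguf_get_arr_str(ctx_gguf, dataset_idx, i));
    }
    return datasets;
}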
@@ -138,6 +138,9 @@ struct slot_params {
     std::string oaicompat_cmpl_id;
     common_chat_syntax oaicompat_chat_syntax;
 
+    // Embeddings
+    int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
+
     json to_json() const {
         std::vector<std::string> samplers;
         samplers.reserve(sampling.samplers.size());
@@ -470,6 +473,33 @@ struct server_task {
                     }
                 }
             }
+        } else if (logit_bias != data.end() && logit_bias->is_object()) {
+            const int n_vocab = llama_vocab_n_tokens(vocab);
+            for (const auto & el : logit_bias->items()) {
+                float bias;
+                const auto & key   = el.key();
+                const auto & value = el.value();
+                if (value.is_number()) {
+                    bias = value.get<float>();
+                } else if (value.is_boolean() && !value.get<bool>()) {
+                    bias = -INFINITY;
+                } else {
+                    continue;
+                }
+
+                char *end;
+                llama_token tok = strtol(key.c_str(), &end, 10);
+                if (*end == 0) {
+                    if (tok >= 0 && tok < n_vocab) {
+                        params.sampling.logit_bias.push_back({tok, bias});
+                    }
+                } else {
+                    auto toks = common_tokenize(vocab, key, false);
+                    for (auto tok : toks) {
+                        params.sampling.logit_bias.push_back({tok, bias});
+                    }
+                }
+            }
         }
 
         params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
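After this change the server accepts `logit_bias` both in the array form it already supported and in the OpenAI-style object form handled by the new branch. A hedged illustration of the two request shapes, written with nlohmann::json as the server does; token id 15043 is an arbitrary placeholder.

#include <nlohmann/json.hpp>
using json = nlohmann::json;

// array form: [[token_id_or_string, bias], ...]
json array_form = json::parse(R"([[15043, -100], [" the", -100]])");

// object form: keys are token ids (as strings) or strings to tokenize;
// a value of false is treated as -infinity by the branch above
json object_form = json::parse(R"({"15043": -100, " the": false})");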
@@ -1899,6 +1929,7 @@ struct server_context {
     mtmd_context * mctx = nullptr;
 
     const llama_vocab * vocab = nullptr;
+    bool vocab_dft_compatible = true;
 
     llama_model * model_dft = nullptr;
 
@@ -1989,10 +2020,9 @@ struct server_context {
                 return false;
             }
 
-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
-
-                return false;
+            vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get());
+            if (!vocab_dft_compatible) {
+                SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
             }
 
             const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
@@ -2082,11 +2112,14 @@ struct server_context {
                     return;
                 }
 
-                slot.spec = common_speculative_init(slot.ctx_dft);
+                slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft);
                 if (slot.spec == nullptr) {
                     SRV_ERR("%s", "failed to create speculator\n");
                     return;
                 }
+                for (auto &pair : params_base.speculative.replacements) {
+                    common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
+                }
             }
 
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
@@ -2601,7 +2634,7 @@ struct server_context {
 
                     // normalize only when there is pooling
                     if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
-                        common_embd_normalize(embd, embd_res.data(), n_embd, 2);
+                        common_embd_normalize(embd, embd_res.data(), n_embd, slot.params.embd_normalize);
                         res->embedding.push_back(embd_res);
                         break;
                     } else {
@@ -4614,6 +4647,14 @@ int main(int argc, char ** argv) {
             }
         }
 
+        int embd_normalize = 2; // default to Euclidean/L2 norm
+        if (body.count("embd_normalize") != 0) {
+            embd_normalize = body.at("embd_normalize");
+            if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
+                SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx));
+            }
+        }
+
         // create and queue the task
         json responses = json::array();
         bool error = false;
@@ -4629,6 +4670,7 @@ int main(int argc, char ** argv) {
 
             // OAI-compat
             task.params.oaicompat = oaicompat;
+            task.params.embd_normalize = embd_normalize;
 
             tasks.push_back(std::move(task));
         }
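As a rough sketch (not the server's code), the `embd_normalize` values documented above follow the convention of common_embd_normalize; the scaling factor is computed approximately like this, with the function name here being a placeholder.

#include <algorithm>
#include <cmath>

// embd_norm: -1 = none, 0 = max absolute int16, 1 = taxicab, 2 = Euclidean/L2, >2 = p-norm
static void embd_normalize_sketch(const float * inp, float * out, int n, int embd_norm) {
    double sum = 0.0;
    switch (embd_norm) {
        case -1: sum = 1.0; break;                      // no normalization
        case 0:                                         // max absolute int16
            for (int i = 0; i < n; ++i) sum = std::max(sum, (double) std::fabs(inp[i]));
            sum /= 32760.0;
            break;
        case 2:                                         // Euclidean / L2
            for (int i = 0; i < n; ++i) sum += (double) inp[i] * inp[i];
            sum = std::sqrt(sum);
            break;
        default:                                        // p-norm (p = embd_norm; 1 = taxicab)
            for (int i = 0; i < n; ++i) sum += std::pow(std::fabs((double) inp[i]), embd_norm);
            sum = std::pow(sum, 1.0 / embd_norm);
            break;
    }
    const float norm = sum > 0.0 ? (float) (1.0 / sum) : 0.0f;
    for (int i = 0; i < n; ++i) out[i] = inp[i] * norm;
}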
@@ -351,3 +351,32 @@ def test_logprobs_stream():
             assert token.top_logprobs is not None
             assert len(token.top_logprobs) > 0
     assert aggregated_text == output_text
+
+
+def test_logit_bias():
+    global server
+    server.start()
+
+    exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"]
+
+    res = server.make_request("POST", "/tokenize", data={
+        "content": " " + " ".join(exclude) + " ",
+    })
+    assert res.status_code == 200
+    tokens = res.body["tokens"]
+    logit_bias = {tok: -100 for tok in tokens}
+
+    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
+    res = client.chat.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        temperature=0.0,
+        messages=[
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ],
+        max_tokens=64,
+        logit_bias=logit_bias
+    )
+    output_text = res.choices[0].message.content
+    assert output_text
+    assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude)
@@ -444,6 +444,39 @@ def test_n_probs_post_sampling():
             assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
 
 
+@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)])
+def test_logit_bias(tokenize, openai_style):
+    global server
+    server.start()
+
+    exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"]
+
+    logit_bias = []
+    if tokenize:
+        res = server.make_request("POST", "/tokenize", data={
+            "content": " " + " ".join(exclude) + " ",
+        })
+        assert res.status_code == 200
+        tokens = res.body["tokens"]
+        logit_bias = [[tok, -100] for tok in tokens]
+
+    else:
+        logit_bias = [[" " + tok + " ", -100] for tok in exclude]
+
+    if openai_style:
+        logit_bias = {el[0]: -100 for el in logit_bias}
+
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 64,
+        "prompt": "What is the best book",
+        "logit_bias": logit_bias,
+        "temperature": 0.0
+    })
+    assert res.status_code == 200
+    output_text = res.body["content"]
+    assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude)
+
+
 def test_cancel_request():
     global server
     server.n_ctx = 4096
12 vendor/minja/chat-template.hpp (vendored)
@@ -162,10 +162,15 @@ class chat_template {
         }), false);
         caps_.supports_tools = contains(out, "some_tool");
 
+        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
+        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
+        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
+
+        json j_null;
         auto make_tool_calls_msg = [&](const json & tool_calls) {
             return json {
                 {"role", "assistant"},
-                {"content", nullptr},
+                {"content", caps_.requires_non_null_content? "" : j_null},
                 {"tool_calls", tool_calls},
             };
         };
@@ -195,9 +200,6 @@ class chat_template {
 
         caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
         caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
-        auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
-        auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
-        caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
 
         if (caps_.supports_tool_calls) {
             auto dummy_args = caps_.requires_object_arguments ? dummy_args_obj : json(dummy_args_obj.dump());
@@ -234,7 +236,7 @@ class chat_template {
             };
             const json tool_call_msg {
                 {"role", "assistant"},
-                {"content", nullptr},
+                {"content", caps_.requires_non_null_content ? "" : j_null},
                 {"tool_calls", json::array({
                     {
                         // TODO: detect if requires numerical id or fixed length == 6 like Nemo
24 vendor/minja/minja.hpp (vendored)
@@ -1355,8 +1355,13 @@ public:
             case Op::Gt: return l > r;
             case Op::Le: return l <= r;
             case Op::Ge: return l >= r;
-            case Op::In: return (r.is_array() || r.is_object()) && r.contains(l);
-            case Op::NotIn: return !(r.is_array() && r.contains(l));
+            case Op::In: return (((r.is_array() || r.is_object()) && r.contains(l)) ||
+                                 (l.is_string() && r.is_string() &&
+                                  r.to_str().find(l.to_str()) != std::string::npos));
+            case Op::NotIn:
+                return !(((r.is_array() || r.is_object()) && r.contains(l)) ||
+                         (l.is_string() && r.is_string() &&
+                          r.to_str().find(l.to_str()) != std::string::npos));
             default: break;
         }
         throw std::runtime_error("Unknown binary operator");
@@ -1552,6 +1557,19 @@ public:
                 else res[i] = std::tolower(res[i]);
             }
             return res;
+        } else if (method->get_name() == "replace") {
+            vargs.expectArgs("replace method", {2, 3}, {0, 0});
+            auto before = vargs.args[0].get<std::string>();
+            auto after = vargs.args[1].get<std::string>();
+            auto count = vargs.args.size() == 3 ? vargs.args[2].get<int64_t>()
+                                                : str.length();
+            size_t start_pos = 0;
+            while ((start_pos = str.find(before, start_pos)) != std::string::npos &&
+                   count-- > 0) {
+                str.replace(start_pos, before.length(), after);
+                start_pos += after.length();
+            }
+            return str;
         }
     }
     throw std::runtime_error("Unknown method: " + method->get_name());
@@ -2128,7 +2146,7 @@ private:
         }
     }
 
-    if ((has_first_colon || has_second_colon) && (start || end || step)) {
+    if ((has_first_colon || has_second_colon)) {
         index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
     } else {
         index = std::move(start);
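Taken together, the three minja changes above make templates like the following render as expected; the outputs in the comments assume standard Jinja2 semantics and are illustrative, not taken from the test suite.

// string containment via the extended Op::In / Op::NotIn
const char * t1 = "{{ 'ell' in 'hello' }}";          // -> "True"
const char * t2 = "{{ 'xyz' not in 'hello' }}";      // -> "True"
// the new "replace" string method
const char * t3 = "{{ 'a,b,c'.replace(',', ';') }}"; // -> "a;b;c"
// slices no longer require an explicit start/end/step expression
const char * t4 = "{{ 'abcdef'[:] }}";               // -> "abcdef"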