Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	.devops/tools.sh
#	.github/workflows/build.yml
#	Makefile
#	README.md
#	common/CMakeLists.txt
#	common/common.h
#	examples/llava/CMakeLists.txt
#	examples/run/CMakeLists.txt
#	examples/run/README.md
#	examples/run/run.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-kompute/ggml-kompute.cpp
#	tests/test-backend-ops.cpp
#	tests/test-rope.cpp
This commit is contained in:
Concedo 2024-12-15 15:30:10 +08:00
commit f456ed7237
38 changed files with 10752 additions and 105 deletions

View file

@ -1078,12 +1078,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;

View file

@ -433,6 +433,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts;
}
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

View file

@ -2001,6 +2001,29 @@ class Qwen2Model(Model):
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2VLForConditionalGeneration")
class Qwen2VLModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
mrope_section = self.hparams["rope_scaling"]["mrope_section"]
mrope_section += [0] * max(0, 4 - len(mrope_section))
self.gguf_writer.add_rope_dimension_sections(mrope_section)
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for name, data in super().get_tensors():
if name.startswith("visual."):
continue
yield name, data
@Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2MOE

View file

@ -105,7 +105,9 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
@ -132,7 +134,8 @@ static std::string format(const char * fmt, ...) {
#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight"
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@ -166,6 +169,7 @@ enum projector_type {
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_UNKNOWN,
};
@ -174,6 +178,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
};
@ -466,7 +471,8 @@ struct clip_vision_model {
// embeddings
struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings;
struct ggml_tensor * patch_embeddings_0;
struct ggml_tensor * patch_embeddings_1; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings;
@ -556,6 +562,7 @@ struct clip_ctx {
bool has_vision_encoder = false;
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
bool has_qwen2vl_merger = false;
int minicpmv_version = 2;
struct clip_vision_model vision_model;
@ -564,6 +571,7 @@ struct clip_ctx {
float image_mean[3];
float image_std[3];
bool use_gelu = false;
bool use_silu = false;
int32_t ftype = 1;
bool has_class_embedding = true;
@ -609,14 +617,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
image_size_height = imgs->data->ny;
}
}
else if (ctx->has_qwen2vl_merger) {
// use the image's native resolution when image is avaible
if (is_inf) {
// if (imgs->data->nx && imgs->data->ny) {
image_size_width = imgs->data->nx;
image_size_height = imgs->data->ny;
}
}
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
int n_layer = hparams.n_layer;
const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
const int batch_size = imgs->size;
@ -637,10 +657,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
if (ctx->has_qwen2vl_merger) {
GGML_ASSERT(image_size_width % (patch_size * 2) == 0);
GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, patches_h, batch_size);
inp = ggml_reshape_4d(
ctx0, inp,
hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
inp = ggml_reshape_3d(
ctx0, inp,
hidden_size, patches_w * patches_h, batch_size);
}
else {
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
}
if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@ -662,12 +702,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
}
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
}
if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size;
@ -691,7 +733,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
// loop over layers
if (ctx->has_minicpmv_projector) {
if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
// TODO: figure out why we doing thing in this way ???
n_layer += 1;
}
for (int il = 0; il < n_layer - 1; il++) {
@ -713,8 +756,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * Q =
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
Q = ggml_rope_multi(
ctx0, Q, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
}
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
@ -722,6 +770,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
if (ctx->has_qwen2vl_merger) {
K = ggml_rope_multi(
ctx0, K, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
}
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
@ -761,6 +814,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->use_gelu) {
cur = ggml_gelu_inplace(ctx0, cur);
} else if (ctx->use_silu) {
cur = ggml_silu_inplace(ctx0, cur);
} else {
cur = ggml_gelu_quick_inplace(ctx0, cur);
}
@ -772,6 +827,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
cur = ggml_add(ctx0, embeddings, cur);
embeddings = cur;
}
// post-layernorm
@ -1033,6 +1089,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ASSERT(false);
}
}
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
@ -1209,6 +1278,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
if (idx != -1) {
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder);
@ -1217,6 +1290,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
idx = get_key_idx(ctx, KEY_USE_GELU);
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
try {
idx = get_key_idx(ctx, KEY_USE_SILU);
new_clip->use_silu = gguf_get_val_bool(ctx, idx);
} catch (std::runtime_error & /*e*/) {
new_clip->use_silu = false;
}
if (verbosity >= 1) {
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@ -1392,11 +1472,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) {
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
}
try {
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
} catch(const std::exception& /*e*/) {
new_clip->has_qwen2vl_merger = false;
}
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@ -1484,6 +1569,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -1522,6 +1613,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch;
batch.size = 1;
batch.data = nullptr;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@ -1535,6 +1627,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
ctx_clip->load_image_size = load_image_size;
}
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
return ctx_clip->load_image_size;
}
struct clip_image_size * clip_image_size_init() {
struct clip_image_size * load_image_size = new struct clip_image_size();
load_image_size->width = 448;
@ -2075,6 +2171,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
}
return true;
}
else if (ctx->has_qwen2vl_merger) {
clip_image_u8 * resized = clip_image_u8_init();
auto patch_size = clip_patch_size(ctx) * 2;
int nx = ceil((float)img->nx / patch_size) * patch_size;
int ny = ceil((float)img->ny / patch_size) * patch_size;
bicubic_resize(*img, *resized, nx, ny);
res_imgs->data = new clip_image_f32[1];
// clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
// res_imgs->data[0] = *res;
res_imgs->size = 1;
// clip_image_f32_free(res);
clip_image_u8_free(resized);
return true;
}
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
@ -2264,6 +2377,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_image_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_size;
}
@ -2285,6 +2405,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
}
int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img);
}
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@ -2298,6 +2425,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch;
}
return n_patches;
@ -2426,7 +2558,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
if (ctx->has_minicpmv_projector) {
if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
image_size_width = imgs->data[0].nx;
image_size_height = imgs->data[0].ny;
}
@ -2446,7 +2578,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
for (size_t i = 0; i < imgs->size; i++) {
const int nx = imgs->data[i].nx;
const int ny = imgs->data[i].ny;
if (!ctx->has_minicpmv_projector) {
if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
GGML_ASSERT(nx == image_size && ny == image_size);
}
@ -2504,9 +2636,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
for(int i=0;i<pos_w * pos_h;++i){
for(int j=0;j<embed_dim;++j){
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
for(int i=0;i < pos_w * pos_h; ++i){
for(int j=0; j < embed_dim; ++j){
pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
}
}
@ -2526,7 +2658,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
}
{
if (ctx->has_qwen2vl_merger) {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int ptr = 0;
for (int y = 0; y < ph; y+=2)
{
for (int x = 0; x < pw; x+=2)
{
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions_data[ptr] = y + dy;
positions_data[num_patches + ptr] = x + dx;
positions_data[num_patches * 2 + ptr] = y + dy;
positions_data[num_patches * 3 + ptr] = x + dx;
ptr++;
}
}
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
else {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
@ -2535,16 +2694,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
{
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
}
@ -2717,6 +2876,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return 3584;
}
}
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -2728,3 +2890,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
}
return 0;
}
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->has_qwen2vl_merger;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
for (int i = 0; i < h*w*3; i++)
{
clip_img.buf[i] = img[i];
}
clip_img.nx = w;
clip_img.ny = h;
clip_image_encode(ctx, n_threads, &clip_img, vec);
return true;
}

View file

@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
#ifdef __cplusplus
}

View file

@ -259,25 +259,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) {
if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
if (clip_is_qwen2vl(ctx_clip)) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
std::memcpy(
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
float * image_embd;
if (clip_is_qwen2vl(ctx_clip)) {
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
} else {
image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
}
if (!image_embd) {
LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;

View file

@ -0,0 +1,158 @@
import argparse
from typing import Dict
import torch
import numpy as np
from gguf import *
from transformers import (
Qwen2VLForConditionalGeneration,
Qwen2VLProcessor,
AutoProcessor,
Qwen2VLConfig
)
VISION = "clip.vision"
def k(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def to_gguf_name(name: str) -> str:
og = name
name = name.replace("text_model", "t").replace("vision_model", "v")
name = name.replace("blocks", "blk").replace("embeddings.", "")
name = name.replace("attn.", "attn_")
name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
# name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
name = name.replace("norm1", "ln1").replace("norm2", "ln2")
name = name.replace("merger.mlp", 'mm')
print(f"[to_gguf_name] {og} --> {name}")
return name
def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
vision_model = qwen2vl.visual
tensor_map = {}
for name, ten in vision_model.state_dict().items():
ten = ten.numpy()
if 'qkv' in name:
if ten.ndim == 2: # weight
c3, _ = ten.shape
else: # bias
c3 = ten.shape[0]
assert c3 % 3 == 0
c = c3 // 3
wq = ten[:c]
wk = ten[c: c * 2]
wv = ten[c * 2:]
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
elif 'merger' in name:
if name.endswith("ln_q.weight"):
tensor_map['v.post_ln.weight'] = ten
elif name.endswith("ln_q.bias"):
tensor_map['v.post_ln.bias'] = ten
else:
# "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
tensor_map[to_gguf_name(name)] = ten
elif 'patch_embed.proj.weight' in name:
# NOTE: split Conv3D into Conv2Ds
c1, c2, kt, kh, kw = ten.shape
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
else:
tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
for new_name, ten in tensor_map.items():
if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
tensor_map[new_name] = ten.astype(np.float32)
else:
tensor_map[new_name] = ten.astype(dtype)
tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder
return tensor_map
def main(args):
if args.data_type == 'fp32':
dtype = torch.float32
np_dtype = np.float32
ftype = 0
elif args.data_type == 'fp16':
dtype = torch.float32
np_dtype = np.float16
ftype = 1
else:
raise ValueError()
model_name = args.model_name
print("model_name: ", model_name)
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, torch_dtype=dtype, device_map="cpu"
)
cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType]
vcfg = cfg.vision_config
if os.path.isdir(model_name):
if model_name.endswith(os.sep):
model_name = model_name[:-1]
model_name = os.path.basename(model_name)
fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
fout = GGUFWriter(path=fname_out, arch="clip")
fout.add_description("image encoder for Qwen2VL")
fout.add_file_type(ftype)
fout.add_bool("clip.has_text_encoder", False)
fout.add_bool("clip.has_vision_encoder", True)
fout.add_bool("clip.has_qwen2vl_merger", True)
fout.add_string("clip.projector_type", "qwen2vl_merger")
print(cfg.vision_config)
if 'silu' in cfg.vision_config.hidden_act.lower():
fout.add_bool("clip.use_silu", True)
fout.add_bool("clip.use_gelu", False)
elif 'gelu' in cfg.vision_config.hidden_act.lower():
fout.add_bool("clip.use_silu", False)
fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
else:
raise ValueError()
tensor_map = find_vision_tensors(qwen2vl, np_dtype)
for name, data in tensor_map.items():
fout.add_tensor(name, data)
fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2)
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # not sure what this does, put 0 here as a placeholder
fout.add_name(model_name)
"""
HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
"""
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue]
fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue]
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()
print("save model as: ", fname_out)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
args = parser.parse_args()
main(args)

View file

@ -0,0 +1,581 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef NDEBUG
#include "ggml-alloc.h"
#include "ggml-backend.h"
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
auto img_tokens = image_embed->n_image_pos;
// llama_pos mrope_pos[img_tokens * 4];
std::vector<llama_pos> mrope_pos;
mrope_pos.resize(img_tokens * 4);
for (int y = 0; y < ph; y++)
{
for (int x = 0; x < pw; x++)
{
int i = y * pw + x;
mrope_pos[i] = *st_pos_id;
mrope_pos[i + img_tokens] = *st_pos_id + y;
mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
mrope_pos[i + img_tokens * 3] = 0;
}
}
*st_pos_id += std::max(pw, ph);
int processed = 0;
std::vector<llama_pos> batch_mrope_pos;
batch_mrope_pos.resize(img_tokens * 4);
for (int i = 0; i < img_tokens; i += n_batch) {
int n_eval = img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
// llama_pos batch_mrope_pos[n_eval * 4];
std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
llama_batch batch = {
int32_t(n_eval), // n_tokens
nullptr, // token
(image_embed->embed+i*n_embd), // embed
batch_mrope_pos.data(), // pos
nullptr, // n_seq_id
nullptr, // seq_id
nullptr, // logits
};
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
processed += n_eval;
}
return true;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
int N = (int) tokens.size();
std::vector<llama_pos> pos;
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
auto batch = llama_batch_get_one(&tokens[i], n_eval);
// TODO: add mrope pos ids somewhere else
pos.resize(batch.n_tokens * 4);
std::fill(pos.begin(), pos.end(), 0);
for (int j = 0; j < batch.n_tokens * 3; j ++) {
pos[j] = *st_pos_id + (j % batch.n_tokens);
}
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
*st_pos_id += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
return true;
}
static const char * sample(struct common_sampler * smpl,
struct llama_context * ctx_llama,
int * n_past, int * st_pos_id) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = common_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past, st_pos_id);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void print_usage(int, char ** argv) {
LOG("\n example usage:\n");
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
// load and preprocess the image
llava_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
return embed;
}
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
int n_past = 0;
int cur_pos_id = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<|vision_start|>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
// llava-1.5 native mode
system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
if (params->verbose_prompt) {
auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
if (image_embed != nullptr) {
auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
// generate the response
LOG("\n");
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout);
}
common_sampler_free(smpl);
LOG("\n");
}
static struct llama_model * llava_init(common_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
#ifndef NDEBUG
static void debug_test_mrope_2d() {
// 1. Initialize backend
ggml_backend_t backend = NULL;
std::string backend_name = "";
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
backend_name = "cuda";
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
backend_name = "cpu";
}
// Calculate the size needed to allocate
size_t ctx_size = 0;
ctx_size += 2 * ggml_tensor_overhead(); // tensors
// no need to allocate anything else!
// 2. Allocate `ggml_context` to store tensor data
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
};
struct ggml_context * ctx = ggml_init(params);
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
ggml_set_name(pos, "pos");
ggml_set_input(pos);
std::vector<float> dummy_q;
dummy_q.resize(128 * 12 * 30);
std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
// memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
std::vector<int> pos_id;
pos_id.resize(30 * 4);
for (int i = 0; i < 30; i ++) {
pos_id[i] = i;
pos_id[i + 30] = i + 10;
pos_id[i + 60] = i + 20;
pos_id[i + 90] = i + 30;
}
int sections[4] = {32, 32, 0, 0};
// 4. Allocate a `ggml_backend_buffer` to store all tensors
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// 5. Copy tensor data from main memory (RAM) to backend buffer
ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
// 6. Create a `ggml_cgraph` for mul_mat operation
struct ggml_cgraph * gf = NULL;
struct ggml_context * ctx_cgraph = NULL;
// create a temporally context to build the graph
struct ggml_init_params params0 = {
/*.mem_size =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
ctx_cgraph = ggml_init(params0);
gf = ggml_new_graph(ctx_cgraph);
struct ggml_tensor * result0 = ggml_rope_multi(
ctx_cgraph, inp_raw, pos, nullptr,
128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
0, 1, 32, 1);
// Add "result" tensor and all of its dependencies to the cgraph
ggml_build_forward_expand(gf, result0);
// 7. Create a `ggml_gallocr` for cgraph computation
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
ggml_gallocr_alloc_graph(allocr, gf);
// 9. Run the computation
int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
ggml_backend_graph_compute(backend, gf);
// 10. Retrieve results (output tensors)
// in this example, output tensor is always the last tensor in the graph
struct ggml_tensor * result = result0;
// struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
float * result_data = (float *)malloc(ggml_nbytes(result));
// because the tensor data is stored in device buffer, we need to copy it back to RAM
ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
const std::string bin_file = "mrope_2d_" + backend_name +".bin";
std::ofstream outFile(bin_file, std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
outFile.close();
std::cout << "Data successfully written to " + bin_file << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
free(result_data);
// 11. Free memory and exit
ggml_free(ctx_cgraph);
ggml_gallocr_free(allocr);
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4;
float vals[56 * 56 * 3];
// float embd[ne];
std::vector<float> embd;
embd.resize(ne);
for (int i = 0; i < 56*56; i++)
{
for (int c = 0; c < 3; c++)
vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
}
clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());
std::ofstream outFile("img_embed.bin", std::ios::binary);
if (outFile.is_open()) {
outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
outFile.close();
std::cout << "Data successfully written to mrope.bin" << std::endl;
} else {
std::cerr << "Error opening file!" << std::endl;
}
}
#endif
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}
common_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
#ifndef NDEBUG
} else if (params.image[0].empty()) {
auto ctx_llava = llava_init_context(&params, model);
debug_test_mrope_2d();
debug_dump_img_embed(ctx_llava);
llama_perf_context_print(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
#endif
} else {
for (auto & image : params.image) {
auto * ctx_llava = llava_init_context(&params, model);
auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_perf_context_print(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_free_model(model);
return 0;
}

View file

@ -459,7 +459,7 @@ struct server_task_result_cmpl_final : server_task_result {
int32_t n_decoded;
int32_t n_prompt_tokens;
int32_t n_tokens_cached;
int32_t has_new_line;
bool has_new_line;
std::string stopping_word;
stop_type stop = STOP_TYPE_NONE;
@ -1079,9 +1079,9 @@ struct server_slot {
SLT_INF(*this,
"\n"
"\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
"\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
"\r total time = %10.2f ms / %5d tokens\n",
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
" total time = %10.2f ms / %5d tokens\n",
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
t_token_generation, n_decoded, t_gen, n_gen_second,
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);

View file

@ -25,6 +25,7 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
assert res.body["timings"]["prompt_n"] == n_prompt
assert res.body["timings"]["predicted_n"] == n_predicted
assert res.body["truncated"] == truncated
assert type(res.body["has_new_line"]) == bool
assert match_regex(re_content, res.body["content"])
@ -48,6 +49,7 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
assert data["timings"]["predicted_n"] == n_predicted
assert data["truncated"] == truncated
assert data["stop_type"] == "limit"
assert type(data["has_new_line"]) == bool
assert "generation_settings" in data
assert server.n_predict is not None
assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)

View file

@ -22,7 +22,7 @@
#include <vector>
#include <memory>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
using json = nlohmann::ordered_json;

View file

@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
#ifdef __cplusplus
}
#endif
#endif // GGML_OPENCL_H

View file

@ -237,7 +237,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#define GGUF_MAGIC "GGUF"
@ -1449,6 +1451,22 @@ extern "C" {
float beta_fast,
float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,

View file

@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
@ -473,7 +480,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (!fs::exists(search_path)) {
continue;
}
for (const auto & entry : fs::directory_iterator(search_path)) {
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string();
@ -538,6 +546,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
}

View file

@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) {
return false;
}
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true;
}
case GGML_OP_UPSCALE: {

View file

@ -9170,6 +9170,64 @@ static void ggml_rope_cache_init(
}
}
static void ggml_mrope_cache_init(
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
float theta_t = theta_base_t;
float theta_h = theta_base_h;
float theta_w = theta_base_w;
float theta_e = theta_base_e; // extra position id for vision encoder
int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
int sec_w = sections[1] + sections[0];
int sec_e = sections[2] + sec_w;
GGML_ASSERT(sect_dims <= ne0);
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
int sector = (i0 / 2) % sect_dims;
if (indep_sects) {
// compute theta independently for each dim sections
// (i.e. reset corresponding theta when `i0` go from one section to another)
if (sector == 0) {
theta_t = theta_base_t;
}
else if (sector == sections[0]) {
theta_h = theta_base_h;;
}
else if (sector == sec_w) {
theta_w = theta_base_w;
}
else if (sector == sec_e) {
theta_e = theta_base_e;
}
}
float theta = theta_t;
if (sector >= sections[0] && sector < sec_w) {
theta = theta_h;
}
else if (sector >= sec_w && sector < sec_w + sections[2]) {
theta = theta_w;
}
else if (sector >= sec_w + sections[2]) {
theta = theta_e;
}
rope_yarn(
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
);
cache[i0 + 1] *= sin_sign;
theta_t *= theta_scale;
theta_w *= theta_scale;
theta_h *= theta_scale;
theta_e *= theta_scale;
}
}
static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
@ -9180,6 +9238,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src2 = dst->src[2];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
@ -9193,6 +9252,7 @@ static void ggml_compute_forward_rope_f32(
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
GGML_TENSOR_UNARY_OP_LOCALS
@ -9225,6 +9285,16 @@ static void ggml_compute_forward_rope_f32(
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne0/2);
}
const float * freq_factors = NULL;
if (src2 != NULL) {
@ -9240,18 +9310,63 @@ static void ggml_compute_forward_rope_f32(
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
if (!is_mrope) {
const int64_t p = pos[i2];
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
else {
const int64_t p_t = pos[i2];
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
const int64_t p_e = pos[i2 + ne2 * 3];
ggml_mrope_cache_init(
p_t, p_h, p_w, p_e, sections, is_vision,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
for (int64_t i1 = 0; i1 < ne1; i1++) {
for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
if (ir++ < ir0) continue;
if (ir > ir1) break;
if (!is_neox) {
if (is_neox || is_mrope) {
if (is_vision){
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
}
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
@ -9265,8 +9380,10 @@ static void ggml_compute_forward_rope_f32(
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[1] = x0*sin_theta + x1*cos_theta;
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
}
if (is_vision) {
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
@ -9276,19 +9393,20 @@ static void ggml_compute_forward_rope_f32(
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
const float x1 = src[n_dims];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
}
}
} else {
// fill the remain channels with data from src tensor
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
}
@ -9306,6 +9424,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src2 = dst->src[2];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
@ -9318,6 +9437,8 @@ static void ggml_compute_forward_rope_f16(
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
GGML_TENSOR_UNARY_OP_LOCALS
@ -9350,6 +9471,16 @@ static void ggml_compute_forward_rope_f16(
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne0/2);
}
const float * freq_factors = NULL;
if (src2 != NULL) {
@ -9367,16 +9498,61 @@ static void ggml_compute_forward_rope_f16(
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
if (!is_mrope) {
const int64_t p = pos[i2];
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
else {
const int64_t p_t = pos[i2];
const int64_t p_h = pos[i2 + ne2];
const int64_t p_w = pos[i2 + ne2 * 2];
const int64_t p_e = pos[i2 + ne2 * 3];
ggml_mrope_cache_init(
p_t, p_h, p_w, p_e, sections, is_vision,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
if (!is_neox) {
if (is_neox || is_mrope) {
if (is_vision) {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
@ -9390,8 +9566,10 @@ static void ggml_compute_forward_rope_f16(
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
} else {
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
}
if (is_vision) {
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const int64_t ic = i0/2;
const float cos_theta = cache[i0 + 0];
@ -9401,19 +9579,19 @@ static void ggml_compute_forward_rope_f16(
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
}
} else {
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
}

View file

@ -4,6 +4,11 @@ struct rope_corr_dims {
float v[2];
};
struct mrope_sections {
int v[4];
};
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
@ -108,6 +113,105 @@ static __global__ void rope_neox(
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_ff>
static __global__ void rope_multi(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
return;
}
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows;
int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
int sec_w = sections.v[1] + sections.v[0];
int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_ff>
static __global__ void rope_vision(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows; // i2-th tokens
int sect_dims = sections.v[0] + sections.v[1];
int sec_w = sections.v[1] + sections.v[0];
int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
const int p = sector;
theta_base = pos[i2]*powf(theta_scale, p);
}
else if (sector >= sections.v[0] && sector < sec_w) {
const int p = sector - sections.v[0];
theta_base = pos[i2 + ne2]*powf(theta_scale, p);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
}
template<typename T>
static void rope_norm_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
@ -156,6 +260,56 @@ static void rope_neox_cuda(
}
}
template<typename T>
static void rope_multi_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
} else {
rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
}
}
template<typename T>
static void rope_vision_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nr, n_blocks_x, 1);
// break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
// where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
} else {
rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
}
}
static void rope_norm_cuda_f16(
const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
static void rope_multi_cuda_f16(
const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_multi_cuda_f32(
const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_vision_cuda_f16(
const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_vision_cuda_f32(
const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne00 = src0->ne[0]; // head dims
const int64_t ne01 = src0->ne[1]; // num heads
const int64_t ne02 = src0->ne[2]; // num heads
const int64_t nr = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int mode = ((int32_t *) dst->op_params)[2];
//const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
mrope_sections sections;
// RoPE alteration for extended context
float freq_base;
@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne00/2);
}
const int32_t * pos = (const int32_t *) src1_d;
@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
} else {
GGML_ABORT("fatal error");
}
} else if (is_mrope && !is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else {
GGML_ABORT("fatal error");
}
} else if (is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_vision_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_vision_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
} else {
GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
rope_norm_cuda_f32(

View file

@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
case GGML_OP_ARGMAX:
case GGML_OP_NORM:
case GGML_OP_ROPE:
return true;
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return true;
}
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D:
@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
} break;
case GGML_OP_ROPE:
{
GGML_ASSERT(ne10 == ne02);
// make sure we have one or more position id(ne10) per token(ne02)
GGML_ASSERT(ne10 % ne02 == 0);
GGML_ASSERT(ne10 >= ne02);
const int nth = MIN(1024, ne00);

View file

@ -0,0 +1,147 @@
find_package(OpenCL REQUIRED)
find_package(Python3 REQUIRED)
set(TARGET_NAME ggml-opencl)
ggml_add_backend_library(${TARGET_NAME}
ggml-opencl.cpp
../../include/ggml-opencl.h)
target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})
if (GGML_OPENCL_PROFILING)
message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
add_compile_definitions(GGML_OPENCL_PROFILING)
endif ()
add_compile_definitions(GGML_OPENCL_SOA_Q)
if (GGML_OPENCL_USE_ADRENO_KERNELS)
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
endif ()
if (GGML_OPENCL_EMBED_KERNELS)
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
set(OPENCL_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
set(OPENCL_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
include_directories("${CMAKE_BINARY_DIR}/autogenerated")
# Python must be accessible from command line
add_custom_command(
OUTPUT ${OPENCL_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
${OPENCL_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
${OPENCL_MM_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_mm.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
${OPENCL_CVT_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_cvt.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_transpose_16.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_transpose_32.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
)
target_sources(${TARGET_NAME} PRIVATE
${OPENCL_CL_SOURCE_EMBED}
${OPENCL_MM_CL_SOURCE_EMBED}
${OPENCL_CVT_CL_SOURCE_EMBED}
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
else ()
# copy ggml-opencl.cl to bin directory
configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
endif ()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,26 @@
#
import sys
import logging
logger = logging.getLogger("opencl-embed-kernel")
def main():
logging.basicConfig(level=logging.INFO)
if len(sys.argv) != 3:
logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
sys.exit(1)
ifile = open(sys.argv[1], "r")
ofile = open(sys.argv[2], "w")
for i in ifile:
ofile.write('R"({})"\n'.format(i))
ifile.close()
ofile.close()
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,106 @@
//------------------------------------------------------------------------------
// This file is contains additional kernels for data conversion.
// These kernels are used when loading the model, so its performance is less
// important.
//------------------------------------------------------------------------------
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supportedby OpenCL implementation on your device."
#endif
#ifdef cl_khr_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#elif defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#error "Subgroup not supported on your device."
#endif
#ifdef cl_intel_required_subgroup_size
// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
// Always use subgroups size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
// TODO: do not know how to choose subgroup size on other GPUs.
#error "Selecting subgroup size is not supported on your device."
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
//------------------------------------------------------------------------------
// mul_vec_q_n_f32_flat_noshuffle
//
// This variation uses flat arrays (struct of arrays, SOA) representation for
// quant tensors. It also uses non shuffled bit order for weights.
//
// The shuffled version is kept in the original file because moving it here
// seems to result in worse performance for adreno.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_0_noshuffle(
global struct block_q4_0 * src0,
global uchar * dst_q,
global half * dst_d
) {
global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
*d = b->d;
for (int i = 0; i < QK4_0/4; ++i) {
uchar x0 = b->qs[2*i + 0];
uchar x1 = b->qs[2*i + 1];
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
#ifdef ADRENO_GPU
// Workaround for adreno - must have the following printf statement for
// the kernel to work properly. Otherwise it produces incorrect result.
// convert_uchar above also seems necessary.
// Compare against a large number so that it does not print anything.
// get_sub_group_local_id() also works.
if (get_global_id(0) == 65536*4096) {
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
}
#endif
}
}

View file

@ -0,0 +1,265 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
// assume
#define QK4_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
ulong offset1, // offset to B (0)
global float * dst, // C
ulong offsetd, // offset to C (0)
uint K, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
__private uint4 regA;
__private half2 regS;
__private float8 regB;
__private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
}
// reduction in local memory, assumes #wave=4
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}

View file

@ -0,0 +1,271 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
// assume
#define QK4_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
ulong offset1, // offset to B (0)
global float * dst, // C
ulong offsetd, // offset to C (0)
int ne00, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M / 2;
uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
__private uint4 regA;
__private half2 regS;
__private float8 regB;
__private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
}
// reduction in local memory, assumes #wave=4
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,130 @@
// src0_q, src0_d, src1 are transposed as a preprocessing step
// 4-bit weights are transposed in groups of 4 (unsigned short int)
// consider weights originally "next to each other", now "on top of each other"
// each fiber computes a 8x4 tile of output elements
// using unshuffled weights
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
__attribute__((qcom_reqd_sub_group_size("full")))
kernel void kernel_mul_mat_Ab_Bi_8x4(
global const ushort * src0_q, // quantized A
global const half * src0_d, // A scales
__read_only image1d_buffer_t src1, // B (1d image)
global float * dst, // C
int m, // M
int n, // N with padding
int k, // K
int n_no_padding // N without padding
) {
int m_4 = m >> 2;
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
half8 B; // registers for activations
half4 dequantized_weights; // registers for dequantized weights
__global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
__global const half* scale_ptr = src0_d + gx_2; // pointer for scales
for(int i=0; i<k; i+=4){ //loop through K dimension
B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
// keep (i/4) and (i/32) in parenthesis, rounds down
// load 4 consecutive groups of 4 weights
ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
// load 4 consecutive scales
half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
// j=0
dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=1
B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=2
B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=3
B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
}
int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
// conditional check if store is to a valid location. Required when N is not a multiple of 8
// if statements allow registers to be reused for each store
// provides a performance boost due to reduced register footprint, which increases number of concurrent waves
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}

View file

@ -0,0 +1,32 @@
// 16-bit transpose, loading/storing an 8x8 tile of elements
kernel void kernel_transpose_16(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_3 = i<<3;
const int j_3 = j<<3;
ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
}

View file

@ -0,0 +1,25 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}

View file

@ -0,0 +1,35 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for non multiple of 8 prompt lengths
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
half4 temp1 = {0,0,0,0};
half4 temp2 = {0,0,0,0};
half4 temp3 = {0,0,0,0};
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
temp0 = read_imageh(input, (j_2+0)*cols+i);
}
if((j_2+1)*cols+i*4+3 < rows*cols*16){
temp1 = read_imageh(input, (j_2+1)*cols+i);
}
if((j_2+2)*cols+i*4+3 < rows*cols*16){
temp2 = read_imageh(input, (j_2+2)*cols+i);
}
if((j_2+3)*cols+i*4+3 < rows*cols*16){
temp3 = read_imageh(input, (j_2+3)*cols+i);
}
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}

View file

@ -4488,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_IM2COL:
// TODO: add support for the new F32 operations
return op->src[0]->type == GGML_TYPE_F16;

View file

@ -7687,7 +7687,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_REPEAT:
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:

View file

@ -3530,15 +3530,18 @@ static struct ggml_tensor * ggml_rope_impl(
GGML_ASSERT(c->ne[0] >= n_dims / 2);
}
int sections[4] = {0, 0, 0, 0};
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &sections, sizeof(int)*4);
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
@ -3560,6 +3563,53 @@ struct ggml_tensor * ggml_rope(
);
}
struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
// Multimodal Rotary Position Embedding
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
if (c) {
GGML_ASSERT(c->type == GGML_TYPE_F32);
GGML_ASSERT(c->ne[0] >= n_dims / 2);
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(&params[11], sections, sizeof(int)*4);
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,

View file

@ -131,6 +131,7 @@ class Keys:
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
FREQ_BASE = "{arch}.rope.freq_base"
SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor"
@ -226,6 +227,7 @@ class MODEL_ARCH(IntEnum):
QWEN = auto()
QWEN2 = auto()
QWEN2MOE = auto()
QWEN2VL = auto()
PHI2 = auto()
PHI3 = auto()
PLAMO = auto()
@ -388,6 +390,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo",
@ -772,6 +775,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.QWEN2VL: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.QWEN2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,

View file

@ -145,11 +145,10 @@ class GGUFReader:
count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize)
end_offs = offset + itemsize * count
return (
self.data[offset:end_offs]
.view(dtype = dtype)[:count]
.newbyteorder(override_order or self.byte_order)
)
arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
if override_order is None:
return arr
return arr.view(arr.dtype.newbyteorder(override_order))
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
if field.name in self.fields:

View file

@ -751,6 +751,9 @@ class GGUFWriter:
def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
def add_rope_freq_base(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)

View file

@ -108,9 +108,11 @@ extern "C" {
};
enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file

View file

@ -177,6 +177,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
@ -231,6 +232,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
@ -322,6 +324,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
@ -438,6 +441,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@ -912,6 +916,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2VL,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_QWEN2MOE,
{
@ -2492,11 +2513,12 @@ struct llama_hparams {
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
int rope_sections[4];
// for State Space Models
uint32_t ssm_d_conv = 0;
@ -2553,6 +2575,9 @@ struct llama_hparams {
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
if (std::equal(std::begin(this->rope_sections),
std::end(this->rope_sections),
std::begin(other.rope_sections))) return true;
if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true;
@ -3396,6 +3421,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output
bool is_encoding = false;
// TODO: find a better way to accommodate mutli-dimension position encoding methods
// number of position id each token get, 1 for each token in most cases.
// when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
int n_pos_per_token = 1;
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
@ -5779,6 +5809,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2VL:
{
std::array<int, 4> section_dims;
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
}
// fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -8298,6 +8335,7 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2VL:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -12706,6 +12744,124 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
cb(lctx.inp_pos, "inp_pos", -1);
ggml_set_input(lctx.inp_pos);
struct ggml_tensor * inp_pos = lctx.inp_pos;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_multi(
ctx0,
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@ -16821,6 +16977,11 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2();
} break;
case LLM_ARCH_QWEN2VL:
{
lctx.n_pos_per_token = 4;
result = llm.build_qwen2vl();
} break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
@ -17039,8 +17200,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
auto n_pos = lctx.n_pos_per_token;
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
}
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
@ -20177,6 +20338,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");