mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
Rewrite history to fix bad vulkan shader commits without increasing repo size
added dpe colab (+8 squashed commit) Squashed commit: [b8362da4] updated lite [ed6c037d] move nsigma into the regular sampler stack [ac5f61c6] relative filepath fixed [05fe96ab] export template [ed0a5a3e] nix_example.md: refactor (#1401) * nix_example.md: add override example * nix_example.md: drop graphics example, already basic nixos knowledge * nix_example.md: format * nix_example.md: Vulkan is disabled on macOS Disabled in:1ccd253acc
* nix_examples.md: nixpkgs.config.cuda{Arches -> Capabilities} Fixes: https://github.com/LostRuins/koboldcpp/issues/1367 [675c62f7] AutoGuess: Phi 4 (mini) (#1402) [4bf56982
] phrasing [b8c0df04
] Add Rep Pen to Top N Sigma sampler chain (#1397) - place after nsigma and before xtc (+3 squashed commit) Squashed commit: [87c52b97
] disable VMM from HIP [ee8906f3
] edit description [e85c0e69
] Remove Unnecessary Rep Counting (#1394) * stop counting reps * fix range-based initializer * strike that - reverse it
This commit is contained in:
parent
50eae1ffeb
commit
6b7d2349a7
114 changed files with 6666 additions and 2642 deletions
|
@ -43,6 +43,7 @@
|
|||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <cinttypes>
|
||||
|
@ -123,6 +124,7 @@ static std::string format(const char * fmt, ...) {
|
|||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
|
@ -447,8 +449,9 @@ struct clip_hparams {
|
|||
|
||||
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
|
||||
|
||||
int32_t image_grid_pinpoints[32];
|
||||
std::vector<int32_t> image_grid_pinpoints;
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
|
@ -588,6 +591,7 @@ struct clip_ctx {
|
|||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
||||
int32_t max_feature_layer;
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
|
@ -654,7 +658,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
|
@ -755,13 +758,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
|
||||
std::vector<struct ggml_tensor *> embedding_stack;
|
||||
const auto & vision_feature_layer = hparams.vision_feature_layer;
|
||||
|
||||
// loop over layers
|
||||
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
||||
n_layer += 1;
|
||||
}
|
||||
for (int il = 0; il < n_layer - 1; il++) {
|
||||
for (int il = 0; il < ctx->max_feature_layer; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// If this is an embedding feature layer, save the output.
|
||||
// NOTE: 0 index here refers to the input to the encoder.
|
||||
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(embeddings);
|
||||
}
|
||||
|
||||
//const size_t nb_q_w = model.layers[il].q_w->nb[0];
|
||||
|
||||
// layernorm1
|
||||
|
@ -849,7 +858,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
|
||||
embeddings = cur;
|
||||
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
|
@ -860,6 +868,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
||||
}
|
||||
|
||||
// final layer is a vision feature layer
|
||||
if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(embeddings);
|
||||
}
|
||||
|
||||
// If feature layers are explicitly set, stack them (if we have multiple)
|
||||
if (!embedding_stack.empty()) {
|
||||
embeddings = embedding_stack[0];
|
||||
for (size_t i = 1; i < embedding_stack.size(); i++) {
|
||||
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
|
||||
}
|
||||
}
|
||||
|
||||
// llava projector
|
||||
if (ctx->has_llava_projector) {
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
@ -1455,14 +1476,26 @@ if(enable_gpu_clip)
|
|||
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
|
||||
int n = gguf_get_arr_n(ctx, idx);
|
||||
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
||||
for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
|
||||
hparams.image_grid_pinpoints[i] = pinpoints[i];
|
||||
for (int i = 0; i < n; ++i) {
|
||||
hparams.image_grid_pinpoints.push_back(pinpoints[i]);
|
||||
}
|
||||
if (n < 32)
|
||||
hparams.image_grid_pinpoints[n] = 0;
|
||||
} catch (std::runtime_error & /*e*/) {
|
||||
hparams.image_grid_pinpoints[0]=0;
|
||||
}
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
|
||||
// Load the vision feature layer indices if they are explicitly provided;
|
||||
// if multiple vision feature layers are present, the values will be concatenated
|
||||
// to form the final visual features.
|
||||
// NOTE: gguf conversions should standardize the values of the vision feature layer to
|
||||
// be non-negative, since we use -1 to mark values as unset here.
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
|
||||
int n = gguf_get_arr_n(ctx, idx);
|
||||
|
||||
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
hparams.vision_feature_layer.insert(vision_feature_layer[i]);
|
||||
}
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
|
||||
|
@ -1488,6 +1521,9 @@ if(enable_gpu_clip)
|
|||
new_clip->image_std[i] = std_data[i];
|
||||
}
|
||||
|
||||
// Calculate the deepest feature layer based on hparams and projector type
|
||||
new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
|
||||
|
||||
if (verbosity >= 2) {
|
||||
LOG_INF("\n%s: vision model hparams\n", __func__);
|
||||
LOG_INF("image_size %d\n", hparams.image_size);
|
||||
|
@ -1501,8 +1537,13 @@ if(enable_gpu_clip)
|
|||
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
LOG_INF("v_image_grid_pinpoints: ");
|
||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
||||
for (const auto & pp : hparams.image_grid_pinpoints) {
|
||||
LOG_INF("%d ", pp);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("v_vision_feature_layer: ");
|
||||
for (const auto & feature_layer: hparams.vision_feature_layer) {
|
||||
LOG_INF("%d ", feature_layer);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
|
@ -1741,11 +1782,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
|
|||
}
|
||||
}
|
||||
|
||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
img->ny = ny;
|
||||
img->buf.resize(3 * nx * ny);
|
||||
memcpy(img->buf.data(), data, img->buf.size());
|
||||
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
||||
}
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
|
@ -1755,7 +1796,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|||
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
clip_build_img_from_pixels(data, nx, ny, img);
|
||||
stbi_image_free(data);
|
||||
return true;
|
||||
}
|
||||
|
@ -1846,14 +1887,14 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||
uint8_t* letterboxed_image = make_new_letterbox_img(data, nx, ny, nc, new_width, new_height);
|
||||
if(letterboxed_image!=nullptr)
|
||||
{
|
||||
build_clip_img_from_data(letterboxed_image, new_width, new_height, img);
|
||||
clip_build_img_from_pixels(letterboxed_image, new_width, new_height, img);
|
||||
free(letterboxed_image);
|
||||
letterboxed_image = nullptr;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
clip_build_img_from_pixels(data, nx, ny, img);
|
||||
}
|
||||
stbi_image_free(data);
|
||||
return true;
|
||||
|
@ -2334,10 +2375,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
}
|
||||
}
|
||||
} else {
|
||||
if (params.image_grid_pinpoints[0] != 0) {
|
||||
if (!params.image_grid_pinpoints.empty()) {
|
||||
// "spatial_unpad" with "anyres" processing for llava-1.6
|
||||
std::vector<std::pair<int, int>> possible_resolutions;
|
||||
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
||||
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
|
||||
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
||||
}
|
||||
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
|
||||
|
@ -2503,7 +2544,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_grid_pinpoints;
|
||||
if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
|
||||
return &ctx->vision_model.hparams.image_grid_pinpoints.front();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_grid_pinpoints.size();
|
||||
}
|
||||
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
|
@ -3038,6 +3086,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
|||
return ctx->has_qwen2vl_merger;
|
||||
}
|
||||
|
||||
// Determine the number of encoder layers to iterate over
|
||||
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
|
||||
// Get the index of the second to last layer; this is the
|
||||
// default for models that have a llava projector
|
||||
const auto & hparams = ctx->vision_model.hparams;
|
||||
int n_layer = hparams.n_layer - 1;
|
||||
int deepest_feature_layer = -1;
|
||||
|
||||
// Handle other projectors; incrementing here indicates that we
|
||||
// should use the last encoder layer for the vision features.
|
||||
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
||||
n_layer += 1;
|
||||
}
|
||||
|
||||
// If we set explicit vision feature layers, only go up to the deepest one
|
||||
for (const auto & feature_layer : hparams.vision_feature_layer) {
|
||||
if (feature_layer > deepest_feature_layer) {
|
||||
deepest_feature_layer = feature_layer;
|
||||
}
|
||||
}
|
||||
return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
clip_image_f32 clip_img;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue