mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-08 18:30:50 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # Makefile # README.md # docs/backend/SYCL.md # examples/llava/README.md # examples/llava/clip.cpp # examples/run/run.cpp # examples/server/README.md # examples/sycl/run-llama2.sh # ggml/CMakeLists.txt # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-cuda/CMakeLists.txt # ggml/src/ggml-hip/CMakeLists.txt # ggml/src/ggml-musa/CMakeLists.txt # ggml/src/ggml-sycl/CMakeLists.txt # tests/test-backend-ops.cpp
This commit is contained in:
commit
722fc2dbf1
79 changed files with 9330 additions and 5299 deletions
183
examples/llava/README-granitevision.md
Normal file
183
examples/llava/README-granitevision.md
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
# Granite Vision
|
||||
|
||||
Download the model and point your `GRANITE_MODEL` environment variable to the path.
|
||||
|
||||
```bash
|
||||
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview
|
||||
$ export GRANITE_MODEL=./granite-vision-3.1-2b-preview
|
||||
```
|
||||
|
||||
|
||||
### 1. Running llava surgery v2.
|
||||
First, we need to run the llava surgery script as shown below:
|
||||
|
||||
`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
|
||||
|
||||
You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
|
||||
|
||||
```bash
|
||||
$ ls $GRANITE_MODEL | grep -i llava
|
||||
llava.clip
|
||||
llava.projector
|
||||
```
|
||||
|
||||
We should see that the projector and visual encoder get split out into the llava files. Quick check to make sure they aren't empty:
|
||||
```python
|
||||
import os

import torch

# Directory of the downloaded model; the surgery script writes its outputs there.
MODEL_PATH = os.getenv("GRANITE_MODEL")
if not MODEL_PATH:
    raise ValueError("env var GRANITE_MODEL is unset!")

clip_file = os.path.join(MODEL_PATH, "llava.clip")
projector_file = os.path.join(MODEL_PATH, "llava.projector")

encoder_tensors = torch.load(clip_file)
projector_tensors = torch.load(projector_file)

# Both split-out files must contain at least one tensor.
assert len(encoder_tensors) > 0
assert len(projector_tensors) > 0
|
||||
```
|
||||
|
||||
If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
|
||||
|
||||
|
||||
### 2. Creating the Visual Component GGUF
|
||||
To create the GGUF for the visual components, we need to write a config for the visual encoder. Make sure the config contains the correct `image_grid_pinpoints`; an example is shown below.
|
||||
|
||||
|
||||
Note: we refer to this file as `$VISION_CONFIG` later on.
|
||||
```json
|
||||
{
|
||||
"_name_or_path": "siglip-model",
|
||||
"architectures": [
|
||||
"SiglipVisionModel"
|
||||
],
|
||||
"image_grid_pinpoints": [
|
||||
[384,768],
|
||||
[384,1152],
|
||||
[384,1536],
|
||||
[384,1920],
|
||||
[384,2304],
|
||||
[384,2688],
|
||||
[384,3072],
|
||||
[384,3456],
|
||||
[384,3840],
|
||||
[768,384],
|
||||
[768,768],
|
||||
[768,1152],
|
||||
[768,1536],
|
||||
[768,1920],
|
||||
[1152,384],
|
||||
[1152,768],
|
||||
[1152,1152],
|
||||
[1536,384],
|
||||
[1536,768],
|
||||
[1920,384],
|
||||
[1920,768],
|
||||
[2304,384],
|
||||
[2688,384],
|
||||
[3072,384],
|
||||
[3456,384],
|
||||
[3840,384]
|
||||
],
|
||||
"mm_patch_merge_type": "spatial_unpad",
|
||||
"hidden_size": 1152,
|
||||
"image_size": 384,
|
||||
"intermediate_size": 4304,
|
||||
"model_type": "siglip_vision_model",
|
||||
"num_attention_heads": 16,
|
||||
"num_hidden_layers": 27,
|
||||
"patch_size": 14,
|
||||
"layer_norm_eps": 1e-6,
|
||||
"hidden_act": "gelu_pytorch_tanh",
|
||||
"projection_dim": 0,
|
||||
"vision_feature_layer": [-24, -20, -12, -1]
|
||||
}
|
||||
```
|
||||
|
||||
Create a new directory to hold the visual components, and copy the llava.clip/projector files, as well as the vision config into it.
|
||||
|
||||
```bash
|
||||
$ ENCODER_PATH=$PWD/visual_encoder
|
||||
$ mkdir $ENCODER_PATH
|
||||
|
||||
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
|
||||
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
|
||||
$ cp $VISION_CONFIG $ENCODER_PATH/config.json
|
||||
```
|
||||
|
||||
At which point you should have something like this:
|
||||
```bash
|
||||
$ ls $ENCODER_PATH
|
||||
config.json llava.projector pytorch_model.bin
|
||||
```
|
||||
|
||||
Now convert the components to GGUF. Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the siglip visual encoder; in the transformers model, you can find these numbers in the [preprocessor_config.json](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview/blob/main/preprocessor_config.json).
|
||||
```bash
|
||||
$ python convert_image_encoder_to_gguf.py \
|
||||
-m $ENCODER_PATH \
|
||||
--llava-projector $ENCODER_PATH/llava.projector \
|
||||
--output-dir $ENCODER_PATH \
|
||||
--clip-model-is-vision \
|
||||
--clip-model-is-siglip \
|
||||
--image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
|
||||
```
|
||||
|
||||
This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as `$VISUAL_GGUF_PATH`.
|
||||
|
||||
|
||||
### 3. Creating the LLM GGUF.
|
||||
The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for the LLM is to load the composite model in `transformers` and export the LLM so that it can be converted directly with the normal conversion path.
|
||||
|
||||
First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to.
|
||||
```
|
||||
$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
|
||||
```
|
||||
|
||||
```python
|
||||
import os

import transformers

# Path of the downloaded granite vision model.
MODEL_PATH = os.getenv("GRANITE_MODEL")
if not MODEL_PATH:
    raise ValueError("env var GRANITE_MODEL is unset!")

# Destination directory for the exported `transformers` LLM.
LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
if not LLM_EXPORT_PATH:
    raise ValueError("env var LLM_EXPORT_PATH is unset!")

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)

# NOTE: granite vision support was added to transformers very recently (4.49);
# if you get size mismatches, your version is too old.
# If you are running with an older version, set `ignore_mismatched_sizes=True`
# as shown below; the full model won't be loaded correctly, but the LLM part of
# the model that we are exporting will be loaded correctly.
model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)

# Save the tokenizer and only the language model into the export directory.
tokenizer.save_pretrained(LLM_EXPORT_PATH)
model.language_model.save_pretrained(LLM_EXPORT_PATH)
|
||||
```
|
||||
|
||||
Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama cpp project.
|
||||
```bash
|
||||
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
|
||||
...
|
||||
$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
|
||||
```
|
||||
|
||||
|
||||
### 4. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-llava-cli`, to which you pass the two GGUF files created above (the LLM via `-m` and the visual component via `--mmproj`). Sample usage:
|
||||
|
||||
Note - the test image shown below can be found [here](https://github-production-user-asset-6210df.s3.amazonaws.com/10740300/415512792-d90d5562-8844-4f34-a0a5-77f62d5a58b5.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20250221%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250221T054145Z&X-Amz-Expires=300&X-Amz-Signature=86c60be490aa49ef7d53f25d6c973580a8273904fed11ed2453d0a38240ee40a&X-Amz-SignedHeaders=host).
|
||||
|
||||
```bash
|
||||
$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
|
||||
--mmproj $VISUAL_GGUF_PATH \
|
||||
--image cherry_blossom.jpg \
|
||||
-c 16384 \
|
||||
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\<image>\nWhat type of flowers are in this picture?\n<|assistant|>\n" \
|
||||
--temp 0
|
||||
```
|
||||
|
||||
Sample response: `The flowers in the picture are cherry blossoms, which are known for their delicate pink petals and are often associated with the beauty of spring.`
|
||||
|
|
@ -43,6 +43,7 @@
|
|||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <cinttypes>
|
||||
|
|
@ -123,6 +124,7 @@ static std::string format(const char * fmt, ...) {
|
|||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
|
|
@ -447,8 +449,9 @@ struct clip_hparams {
|
|||
|
||||
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
|
||||
|
||||
int32_t image_grid_pinpoints[32];
|
||||
std::vector<int32_t> image_grid_pinpoints;
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
|
|
@ -588,6 +591,7 @@ struct clip_ctx {
|
|||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
||||
int32_t max_feature_layer;
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
|
|
@ -654,7 +658,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
const int d_head = hidden_size / n_head;
|
||||
int n_layer = hparams.n_layer;
|
||||
const float eps = hparams.eps;
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
|
|
@ -755,13 +758,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
|
||||
std::vector<struct ggml_tensor *> embedding_stack;
|
||||
const auto & vision_feature_layer = hparams.vision_feature_layer;
|
||||
|
||||
// loop over layers
|
||||
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
||||
n_layer += 1;
|
||||
}
|
||||
for (int il = 0; il < n_layer - 1; il++) {
|
||||
for (int il = 0; il < ctx->max_feature_layer; il++) {
|
||||
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
||||
|
||||
// If this is an embedding feature layer, save the output.
|
||||
// NOTE: 0 index here refers to the input to the encoder.
|
||||
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(embeddings);
|
||||
}
|
||||
|
||||
//const size_t nb_q_w = model.layers[il].q_w->nb[0];
|
||||
|
||||
// layernorm1
|
||||
|
|
@ -849,7 +858,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
|
||||
embeddings = cur;
|
||||
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
|
|
@ -860,6 +868,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
||||
}
|
||||
|
||||
// final layer is a vision feature layer
|
||||
if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(embeddings);
|
||||
}
|
||||
|
||||
// If feature layers are explicitly set, stack them (if we have multiple)
|
||||
if (!embedding_stack.empty()) {
|
||||
embeddings = embedding_stack[0];
|
||||
for (size_t i = 1; i < embedding_stack.size(); i++) {
|
||||
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
|
||||
}
|
||||
}
|
||||
|
||||
// llava projector
|
||||
if (ctx->has_llava_projector) {
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
|
@ -1455,14 +1476,26 @@ if(enable_gpu_clip)
|
|||
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
|
||||
int n = gguf_get_arr_n(ctx, idx);
|
||||
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
||||
for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
|
||||
hparams.image_grid_pinpoints[i] = pinpoints[i];
|
||||
for (int i = 0; i < n; ++i) {
|
||||
hparams.image_grid_pinpoints.push_back(pinpoints[i]);
|
||||
}
|
||||
if (n < 32)
|
||||
hparams.image_grid_pinpoints[n] = 0;
|
||||
} catch (std::runtime_error & /*e*/) {
|
||||
hparams.image_grid_pinpoints[0]=0;
|
||||
}
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
|
||||
// Load the vision feature layer indices if they are explicitly provided;
|
||||
// if multiple vision feature layers are present, the values will be concatenated
|
||||
// to form the final visual features.
|
||||
// NOTE: gguf conversions should standardize the values of the vision feature layer to
|
||||
// be non-negative, since we use -1 to mark values as unset here.
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
|
||||
int n = gguf_get_arr_n(ctx, idx);
|
||||
|
||||
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
hparams.vision_feature_layer.insert(vision_feature_layer[i]);
|
||||
}
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
|
||||
|
|
@ -1488,6 +1521,9 @@ if(enable_gpu_clip)
|
|||
new_clip->image_std[i] = std_data[i];
|
||||
}
|
||||
|
||||
// Calculate the deepest feature layer based on hparams and projector type
|
||||
new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
|
||||
|
||||
if (verbosity >= 2) {
|
||||
LOG_INF("\n%s: vision model hparams\n", __func__);
|
||||
LOG_INF("image_size %d\n", hparams.image_size);
|
||||
|
|
@ -1501,8 +1537,13 @@ if(enable_gpu_clip)
|
|||
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
LOG_INF("v_image_grid_pinpoints: ");
|
||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
||||
for (const auto & pp : hparams.image_grid_pinpoints) {
|
||||
LOG_INF("%d ", pp);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("v_vision_feature_layer: ");
|
||||
for (const auto & feature_layer: hparams.vision_feature_layer) {
|
||||
LOG_INF("%d ", feature_layer);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
|
|
@ -1741,11 +1782,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
|
|||
}
|
||||
}
|
||||
|
||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
img->ny = ny;
|
||||
img->buf.resize(3 * nx * ny);
|
||||
memcpy(img->buf.data(), data, img->buf.size());
|
||||
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
||||
}
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
|
|
@ -1755,7 +1796,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|||
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
clip_build_img_from_pixels(data, nx, ny, img);
|
||||
stbi_image_free(data);
|
||||
return true;
|
||||
}
|
||||
|
|
@ -1846,14 +1887,14 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||
uint8_t* letterboxed_image = make_new_letterbox_img(data, nx, ny, nc, new_width, new_height);
|
||||
if(letterboxed_image!=nullptr)
|
||||
{
|
||||
build_clip_img_from_data(letterboxed_image, new_width, new_height, img);
|
||||
clip_build_img_from_pixels(letterboxed_image, new_width, new_height, img);
|
||||
free(letterboxed_image);
|
||||
letterboxed_image = nullptr;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
clip_build_img_from_pixels(data, nx, ny, img);
|
||||
}
|
||||
stbi_image_free(data);
|
||||
return true;
|
||||
|
|
@ -2334,10 +2375,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
}
|
||||
}
|
||||
} else {
|
||||
if (params.image_grid_pinpoints[0] != 0) {
|
||||
if (!params.image_grid_pinpoints.empty()) {
|
||||
// "spatial_unpad" with "anyres" processing for llava-1.6
|
||||
std::vector<std::pair<int, int>> possible_resolutions;
|
||||
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
|
||||
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
|
||||
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
||||
}
|
||||
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
|
||||
|
|
@ -2503,7 +2544,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
|||
}
|
||||
|
||||
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_grid_pinpoints;
|
||||
if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
|
||||
return &ctx->vision_model.hparams.image_grid_pinpoints.front();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.hparams.image_grid_pinpoints.size();
|
||||
}
|
||||
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
|
|
@ -3038,6 +3086,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
|||
return ctx->has_qwen2vl_merger;
|
||||
}
|
||||
|
||||
// Determine the number of encoder layers to iterate over
|
||||
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
|
||||
// Get the index of the second to last layer; this is the
|
||||
// default for models that have a llava projector
|
||||
const auto & hparams = ctx->vision_model.hparams;
|
||||
int n_layer = hparams.n_layer - 1;
|
||||
int deepest_feature_layer = -1;
|
||||
|
||||
// Handle other projectors; incrementing here indicates that we
|
||||
// should use the last encoder layer for the vision features.
|
||||
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
||||
n_layer += 1;
|
||||
}
|
||||
|
||||
// If we set explicit vision feature layers, only go up to the deepest one
|
||||
for (const auto & feature_layer : hparams.vision_feature_layer) {
|
||||
if (feature_layer > deepest_feature_layer) {
|
||||
deepest_feature_layer = feature_layer;
|
||||
}
|
||||
}
|
||||
return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
|
||||
}
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||
clip_image_f32 clip_img;
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
|
|||
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
|
@ -73,6 +74,9 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
|||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||
|
||||
/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */
|
||||
CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
|
|
@ -89,11 +93,13 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
|
|||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||
|
||||
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API void set_clip_uses_gpu(bool usegpu);
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import re
|
|||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
|
||||
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
|
||||
|
||||
TEXT = "clip.text"
|
||||
VISION = "clip.vision"
|
||||
|
|
@ -37,6 +37,18 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
|
|||
|
||||
|
||||
def get_tensor_name(name: str) -> str:
|
||||
# Standardize the transformers llava next keys for
|
||||
# image newline / mm projector with the classes in haotian-liu LLaVA
|
||||
if name == "image_newline":
|
||||
return "model.image_newline"
|
||||
if name.startswith("multi_modal_projector"):
|
||||
name = name.replace("multi_modal_projector", "mm")
|
||||
if "linear_1" in name:
|
||||
name = name.replace("linear_1", "0")
|
||||
if "linear_2" in name:
|
||||
name = name.replace("linear_2", "2")
|
||||
return name
|
||||
|
||||
if "projection" in name:
|
||||
return name
|
||||
if "mm_projector" in name:
|
||||
|
|
@ -83,8 +95,14 @@ ap.add_argument("--vision-only", action="store_true", required=False,
|
|||
help="Save a vision-only model. It can't be used to encode texts")
|
||||
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
|
||||
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
|
||||
|
||||
# Selectable visual encoders that are compatible with this script
|
||||
encoder_group = ap.add_mutually_exclusive_group()
|
||||
encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
|
||||
help="The clip model is from openclip (for ViT-SO400M type))")
|
||||
encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
|
||||
help="the visual encoder is Siglip.")
|
||||
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
|
|
@ -109,7 +127,12 @@ if args.use_f32:
|
|||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
|
||||
if (
|
||||
args.clip_model_is_vision or
|
||||
not os.path.exists(dir_model + "/vocab.json") or
|
||||
args.clip_model_is_openclip or
|
||||
args.clip_model_is_siglip
|
||||
):
|
||||
vocab = None
|
||||
tokens = None
|
||||
else:
|
||||
|
|
@ -137,7 +160,10 @@ ftype = 1
|
|||
if args.use_f32:
|
||||
ftype = 0
|
||||
|
||||
if args.clip_model_is_vision or args.clip_model_is_openclip:
|
||||
if args.clip_model_is_siglip:
|
||||
model = SiglipVisionModel.from_pretrained(dir_model)
|
||||
processor = None
|
||||
elif args.clip_model_is_vision or args.clip_model_is_openclip:
|
||||
model = CLIPVisionModel.from_pretrained(dir_model)
|
||||
processor = None
|
||||
else:
|
||||
|
|
@ -187,26 +213,71 @@ else:
|
|||
if has_text_encoder:
|
||||
assert t_hparams is not None
|
||||
assert tokens is not None
|
||||
if args.clip_model_is_siglip:
|
||||
text_projection_dim = 0
|
||||
else:
|
||||
text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
|
||||
# text_model hparams
|
||||
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
|
||||
fout.add_uint32("clip.text.projection_dim", text_projection_dim)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
|
||||
fout.add_token_list(tokens)
|
||||
|
||||
|
||||
|
||||
def get_non_negative_vision_feature_layers(v_hparams):
    """
    Determine the vision feature layer(s) for the llava model, which are indices into the
    hidden states of the visual encoder. Note that the hidden states array generally takes the
    form:

        [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>]

    so feature indices should be offset as n+1 to get the output of encoder block n.
    We convert all vision feature layers to non-negative so that -1 can be used in
    the model as an unset value. If no vision feature layer is found, we leave it unset.

    The layer indices are read from the module-level ``config``; only
    ``num_hidden_layers`` is taken from ``v_hparams``.

    Returns a list of non-negative layer indices, or None when the config does
    not specify any vision feature layer (key missing or set to null).
    """
    num_hidden_layers = v_hparams["num_hidden_layers"]

    feature_layers_key = None
    # Key used for llava models in transformers
    if "vision_feature_layer" in config:
        feature_layers_key = "vision_feature_layer"
    # Key used for llava models in the original format
    elif "mm_vision_select_layer" in config:
        feature_layers_key = "mm_vision_select_layer"

    if feature_layers_key is None:
        return None

    feature_layers = config[feature_layers_key]
    # A key that is present but null means "unset"; iterating it would raise TypeError.
    if feature_layers is None:
        return None
    if isinstance(feature_layers, int):
        feature_layers = [feature_layers]

    # Negative indices count back from the end of the hidden states array,
    # which has num_hidden_layers + 1 entries.
    return [
        layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
        for layer_idx in feature_layers
    ]
|
||||
|
||||
# Determine if we have explicitly specified vision feature layers in our config
|
||||
feature_layers = get_non_negative_vision_feature_layers(v_hparams)
|
||||
|
||||
if has_vision_encoder:
|
||||
# vision_model hparams
|
||||
# Siglip does not have a visual projector; set projection dim to 0
|
||||
if args.clip_model_is_siglip:
|
||||
visual_projection_dim = 0
|
||||
else:
|
||||
visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])
|
||||
|
||||
# set vision_model hparams
|
||||
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
|
||||
fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
|
||||
fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
|
||||
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
||||
if feature_layers:
|
||||
block_count = max(feature_layers)
|
||||
else:
|
||||
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
||||
# /**
|
||||
# "image_grid_pinpoints": [
|
||||
|
|
@ -258,7 +329,8 @@ if has_vision_encoder:
|
|||
fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
|
||||
if "mm_projector_type" in v_hparams:
|
||||
fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
|
||||
|
||||
if feature_layers:
|
||||
fout.add_array("clip.vision.feature_layer", feature_layers)
|
||||
|
||||
if processor is not None:
|
||||
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
|
||||
|
|
@ -274,7 +346,13 @@ fout.add_bool("clip.use_gelu", use_gelu)
|
|||
|
||||
|
||||
if has_llava_projector:
|
||||
model.vision_model.encoder.layers.pop(-1)
|
||||
# By default, we drop the last layer for llava projector
|
||||
# models unless we have explicitly set vision feature layers
|
||||
if feature_layers is None:
|
||||
model.vision_model.encoder.layers.pop(-1)
|
||||
else:
|
||||
model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
|
||||
|
||||
projector = torch.load(args.llava_projector)
|
||||
for name, data in projector.items():
|
||||
name = get_tensor_name(name)
|
||||
|
|
|
|||
|
|
@ -353,9 +353,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||
const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
|
||||
|
||||
std::vector<std::pair<int, int>> grid_pinpoints;
|
||||
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
|
||||
for (size_t i = 0; i < num_gridpoints; i += 2) {
|
||||
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
|
||||
}
|
||||
|
||||
|
|
@ -405,7 +406,8 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
|||
}
|
||||
|
||||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||
int num_max_patches = 6;
|
||||
// Granite vision uses up to 10 patches + base patch
|
||||
int num_max_patches = 11;
|
||||
if (clip_is_minicpmv(ctx_clip)) {
|
||||
num_max_patches = 10;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,6 +33,33 @@ def save_model(model, file_path, file_type):
|
|||
else:
|
||||
torch.save(model, file_path)
|
||||
|
||||
# Helpers to match weight names from specific components or
|
||||
# determine if a saved shard contains that component
|
||||
def is_vision_tower(weight_name):
    """Return True if ``weight_name`` begins with a vision tower prefix.

    The recognised prefixes are ``model.vision_tower``, ``vit.`` and
    ``vision_tower``.

    Args:
        weight_name: Name of a weight (a checkpoint key) as a string.

    Returns:
        bool: True when the name starts with one of the prefixes above.
    """
    # str.startswith accepts a tuple of prefixes, so one call replaces
    # the chain of ``or``-ed startswith checks.
    return weight_name.startswith(
        ("model.vision_tower", "vit.", "vision_tower")
    )
|
||||
|
||||
def is_newline(weight_name):
    """Return True if ``weight_name`` begins with an image newline prefix.

    The recognised prefixes are ``model.image_newline`` and
    ``image_newline``.

    Args:
        weight_name: Name of a weight (a checkpoint key) as a string.

    Returns:
        bool: True when the name starts with one of the prefixes above.
    """
    # A tuple argument lets a single startswith call test both prefixes.
    return weight_name.startswith(("model.image_newline", "image_newline"))
|
||||
|
||||
def is_mm_projector(weight_name):
    """Return True if ``weight_name`` begins with a projector prefix.

    The recognised prefixes are ``model.mm_projector``, ``vision_proj.``
    and ``multi_modal_projector``.

    Args:
        weight_name: Name of a weight (a checkpoint key) as a string.

    Returns:
        bool: True when the name starts with one of the prefixes above.
    """
    # One tuple-argument startswith call instead of three ``or``-ed calls.
    return weight_name.startswith(
        ("model.mm_projector", "vision_proj.", "multi_modal_projector")
    )
|
||||
|
||||
def newline_criteria(checkpoint):
    """Report whether any key of ``checkpoint`` satisfies ``is_newline``."""
    for weight_name in checkpoint.keys():
        if is_newline(weight_name):
            return True
    return False
|
||||
|
||||
def proj_criteria(checkpoint):
    """Report whether any key of ``checkpoint`` satisfies ``is_mm_projector``."""
    for weight_name in checkpoint.keys():
        if is_mm_projector(weight_name):
            return True
    return False
|
||||
|
||||
# Adapted function to clean vision tower from checkpoint
|
||||
def clean_vision_tower_from_checkpoint(checkpoint_path):
|
||||
|
|
@ -40,7 +67,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
|
|||
# file_type = 'pytorch'
|
||||
model_path = os.path.dirname(checkpoint_path)
|
||||
print(f"Searching for vision tower tensors in {checkpoint_path}")
|
||||
clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]
|
||||
clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)]
|
||||
|
||||
if len(clip_tensors) > 0:
|
||||
print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
|
||||
|
|
@ -84,12 +111,6 @@ def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
|
|||
|
||||
return newline_checkpoint_path, projector_checkpoint_path
|
||||
|
||||
def newline_criteria(checkpoint):
|
||||
return any(k.startswith("model.image_newline") for k in checkpoint.keys())
|
||||
|
||||
def proj_criteria(checkpoint):
|
||||
return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
|
||||
|
||||
|
||||
# Command-line interface setup
|
||||
ap = argparse.ArgumentParser()
|
||||
|
|
@ -123,14 +144,14 @@ first_checkpoint = None
|
|||
if newline_checkpoint_path is not None:
|
||||
print(f"Taking newline from {newline_checkpoint_path}")
|
||||
first_checkpoint, file_type = load_model(newline_checkpoint_path)
|
||||
first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
|
||||
first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)]
|
||||
|
||||
# Load the checkpoint
|
||||
mm_tensors = []
|
||||
last_checkpoint = None
|
||||
if projector_checkpoint_path is not None:
|
||||
last_checkpoint, file_type = load_model(projector_checkpoint_path)
|
||||
mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
|
||||
mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)]
|
||||
|
||||
if len(mm_tensors) == 0:
|
||||
if last_checkpoint is not None:
|
||||
|
|
@ -155,5 +176,5 @@ if len(projector) > 0:
|
|||
save_model(projector, f"{args.model}/llava.projector", 'pytorch')
|
||||
|
||||
print("Done!")
|
||||
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
|
||||
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
|
||||
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
|
||||
|
|
|
|||
|
|
@ -521,8 +521,13 @@ static json oaicompat_completion_params_parse(const json & body) {
|
|||
throw std::runtime_error("Only one completion choice is allowed");
|
||||
}
|
||||
|
||||
// Handle "echo" field
|
||||
if (json_value(body, "echo", false)) {
|
||||
throw std::runtime_error("Only no echo is supported");
|
||||
}
|
||||
|
||||
// Params supported by OAI but unsupported by llama.cpp
|
||||
static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
|
||||
static const std::vector<std::string> unsupported_params { "best_of", "suffix" };
|
||||
for (const auto & param : unsupported_params) {
|
||||
if (body.contains(param)) {
|
||||
throw std::runtime_error("Unsupported param: " + param);
|
||||
|
|
@ -598,7 +603,7 @@ static json oaicompat_completion_params_parse(
|
|||
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
|
||||
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
||||
inputs.grammar = grammar;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
inputs.use_jinja = use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
|
||||
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue