examples : add model conversion tool/example (#15455)

* examples : add model conversion tool/example

This commit adds an "example/tool" that is intended to help in the
process of converting models to GGUF. Currently it supports normal
causal models and embedding models. The readme contains instructions and
command to guide through the process.

The motivation for this to have a structured and repeatable process for
model conversions and hopefully with time improve upon it to make the
process easier and more reliable. We have started to use this for new
model conversions internally and will continue doing so and improve it
as we go along. Perhaps with time this should be placed in a different
directory than the examples directory, but for now it seems like a good
place to keep it while we are still developing it.

* squash! examples : add model conversion tool/example

Remove dependency on scikit-learn in model conversion example.

* squash! examples : add model conversion tool/example

Update transformer dep to use non-dev version. And also import
`AutoModelForCausalLM` instead of `AutoModel` to ensure compatibility
with the latest version.

* squash! examples : add model conversion tool/example

Remove the logits requirements file from the all requirements file.
This commit is contained in:
Daniel Bevenius 2025-08-21 12:16:54 +02:00 committed by GitHub
parent b108e42904
commit 2758fa10da
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 2230 additions and 0 deletions

View file

@ -0,0 +1,42 @@
#/bin/bash
set -e
MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
else
# Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
python3 -c "
import json
import sys
import struct
data = json.load(sys.stdin)
# Flatten all embeddings completely
flattened = []
for item in data:
embedding = item['embedding']
for token_embedding in embedding:
flattened.extend(token_embedding)
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
# Write as binary floats - matches logitc.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
for value in flattened:
f.write(struct.pack('f', value))
"
CPP_EMBEDDINGS="$TEMP_FILE"
trap "rm -f $TEMP_FILE" EXIT
fi
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
--cpp-embeddings $CPP_EMBEDDINGS \
--prompt "Hello world today"

View file

@ -0,0 +1,22 @@
#!/bin/bash
set -e
MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${EMBEDDING_MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
python ../../convert_hf_to_gguf.py --verbose \
${EMBEDDING_MODEL_PATH} \
--outfile ${CONVERTED_MODEL} \
--outtype ${TYPE}
echo ""
echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"
echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})"

View file

@ -0,0 +1,20 @@
#!/bin/bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_EMBEDDING_MODEL environment variable" >&2
exit 1
fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"

View file

@ -0,0 +1,116 @@
#!/usr/bin/env python3
import argparse
import os
import numpy as np
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
tokenizer = AutoTokenizer.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}Model"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModel.from_pretrained(model_path)
print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")
config = AutoConfig.from_pretrained(model_path)
model_name = os.path.basename(model_path)
texts = [ "Hello world today" ]
encoded = tokenizer(
texts,
padding=True,
truncation=True,
return_tensors="pt"
)
tokens = encoded['input_ids'][0]
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
with torch.no_grad():
outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
# Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
print(f"Hidden states shape: {hidden_states.shape}")
print(f"All embeddings shape: {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1]}")
# Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0]
print() # Empty line to match C++ output
for j in range(n_embd_count):
embedding = all_embeddings[j]
print(f"embedding {j}: ", end="")
# Print first 3 values
for i in range(min(3, n_embd)):
print(f"{embedding[i]:9.6f} ", end="")
print(" ... ", end="")
# Print last 3 values
for i in range(n_embd - 3, n_embd):
print(f"{embedding[i]:9.6f} ", end="")
print() # New line
print() # Final empty line to match C++ output
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
# Save all embeddings flattened (matching what embedding.cpp would save if it did)
flattened_embeddings = all_embeddings.flatten()
flattened_embeddings.astype(np.float32).tofile(bin_filename)
with open(txt_filename, "w") as f:
f.write(f"# Model class: {model_name}\n")
f.write(f"# Tokens: {token_strings}\n")
f.write(f"# Shape: {all_embeddings.shape}\n")
f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
for j in range(n_embd_count):
f.write(f"# Token {j} ({token_strings[j]}):\n")
for i, value in enumerate(all_embeddings[j]):
f.write(f"{j}_{i}: {value:.6f}\n")
f.write("\n")
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
print("")
print(f"Saved bin embeddings to: {bin_filename}")
print(f"Saved txt embeddings to: {txt_filename}")