examples : add model conversion tool/example (#15455)

* examples : add model conversion tool/example

This commit adds an "example/tool" that is intended to help in the
process of converting models to GGUF. Currently it supports normal
causal models and embedding models. The readme contains instructions and
command to guide through the process.

The motivation for this to have a structured and repeatable process for
model conversions and hopefully with time improve upon it to make the
process easier and more reliable. We have started to use this for new
model conversions internally and will continue doing so and improve it
as we go along. Perhaps with time this should be placed in a different
directory than the examples directory, but for now it seems like a good
place to keep it while we are still developing it.

* squash! examples : add model conversion tool/example

Remove dependency on scikit-learn in model conversion example.

* squash! examples : add model conversion tool/example

Update transformer dep to use non-dev version. And also import
`AutoModelForCausalLM` instead of `AutoModel` to ensure compatibility
with the latest version.

* squash! examples : add model conversion tool/example

Remove the logits requirements file from the all requirements file.
This commit is contained in:
Daniel Bevenius 2025-08-21 12:16:54 +02:00 committed by GitHub
parent b108e42904
commit 2758fa10da
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 2230 additions and 0 deletions

View file

@ -0,0 +1,43 @@
#/bin/bash
set -e
MODEL_PATH="${1:-"$MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
if [ -t 0 ]; then
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
else
# Process piped JSON data and convert to binary (matching logits.cpp format)
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
python3 -c "
import json
import sys
import struct
data = json.load(sys.stdin)
# Flatten all embeddings completely
flattened = []
for item in data:
embedding = item['embedding']
for token_embedding in embedding:
flattened.extend(token_embedding)
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
# Write as binary floats - matches logitc.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
for value in flattened:
f.write(struct.pack('f', value))
"
CPP_EMBEDDINGS="$TEMP_FILE"
trap "rm -f $TEMP_FILE" EXIT
fi
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
--cpp-embeddings $CPP_EMBEDDINGS \
--prompt "Hello world today" \
--causal

View file

@ -0,0 +1,88 @@
#!/usr/bin/env python3
import numpy as np
import sys
import os
from pathlib import Path
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""
try:
pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32)
llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32)
except Exception as e:
print(f"❌ NOK: Failed to load files - {e}")
return False
# Check shapes match
if pytorch_logits.shape != llamacpp_logits.shape:
print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}")
return False
# Calculate key metrics
diff = pytorch_logits - llamacpp_logits
abs_diff = np.abs(diff)
max_diff = np.max(abs_diff)
# Get top 10 predictions from both models
pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1]
llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1]
print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}")
print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
print(f"Max absolute difference: {max_diff:.4f}")
if max_diff > 1.0:
print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
return False
return True
def main():
model_path = os.getenv('MODEL_PATH')
if not model_path:
print("Error: MODEL_PATH environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
model_name = os.path.splitext(os.path.basename(model_path))[0]
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
if not pytorch_file.exists():
print(f"Error: PyTorch logits file not found: {pytorch_file}")
print("Please run scripts/run-org-model.sh first to generate this file.")
sys.exit(1)
if not llamacpp_file.exists():
print(f"Error: llama.cpp logits file not found: {llamacpp_file}")
print("Please run scripts/run-converted-model.sh first to generate this file.")
sys.exit(1)
print("Checked all required files were found. Proceeding...\n")
print("🔍 GGML Model Validation for model ", model_name)
print("=" * 40)
print(f"PyTorch logits : {pytorch_file}")
print(f"llama.cpp logits: {llamacpp_file}")
print()
success = quick_logits_check(pytorch_file, llamacpp_file)
# Exit with appropriate code
if success:
print("✅ OK: Lightweight model check successful!")
print(" Ok to proceed with NMSE check...")
sys.exit(0)
else:
print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
sys.exit(1)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,22 @@
#!/bin/bash
MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
echo "Metadata override: ${METADATA_OVERRIDE}"
python ../../convert_hf_to_gguf.py --verbose \
${MODEL_PATH} \
--outfile ${CONVERTED_MODEL} \
--outtype ${TYPE} \
--metadata "${METADATA_OVERRIDE}"
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"

View file

@ -0,0 +1,113 @@
#!/usr/bin/env python3
import argparse
import os
import importlib
import sys
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM
from pathlib import Path
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
config = AutoConfig.from_pretrained(model_path)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
print(f"Model class: {type(model)}")
#print(f"Model file: {type(model).__module__}")
model_name = os.path.basename(model_path)
print(f"Model name: {model_name}")
prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids, output_hidden_states=True)
# Extract hidden states from the last layer
# outputs.hidden_states is a tuple of (num_layers + 1) tensors
# Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
last_hidden_states = outputs.hidden_states[-1]
# Get embeddings for all tokens
token_embeddings = last_hidden_states[0].cpu().numpy() # Remove batch dimension
print(f"Hidden states shape: {last_hidden_states.shape}")
print(f"Token embeddings shape: {token_embeddings.shape}")
print(f"Hidden dimension: {token_embeddings.shape[-1]}")
print(f"Number of tokens: {token_embeddings.shape[0]}")
# Save raw token embeddings
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
# Save all token embeddings as binary
print(token_embeddings)
token_embeddings.astype(np.float32).tofile(bin_filename)
# Save as text for inspection
with open(txt_filename, "w") as f:
for i, embedding in enumerate(token_embeddings):
for j, val in enumerate(embedding):
f.write(f"{i} {j} {val:.6f}\n")
# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
for i, embedding in enumerate(token_embeddings):
# Format: show first few values, ..., then last few values
if len(embedding) > 10:
# Show first 3 and last 3 values with ... in between
first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
print(f"embedding {i}: {first_vals} ... {last_vals}")
else:
# If embedding is short, show all values
vals = " ".join(f"{val:8.6f}" for val in embedding)
print(f"embedding {i}: {vals}")
# Also show token info for reference
print(f"\nToken reference:")
for i, token in enumerate(tokens):
print(f" Token {i}: {repr(token)}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")

View file

@ -0,0 +1,18 @@
#!/bin/bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"

View file

@ -0,0 +1,20 @@
#!/bin/bash
set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
echo "Error: Model path must be provided either as:" >&2
echo " 1. Command line argument" >&2
echo " 2. CONVERTED_MODEL environment variable" >&2
exit 1
fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"

View file

@ -0,0 +1,100 @@
#!/usr/bin/env python3
import argparse
import os
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
config = AutoConfig.from_pretrained(model_path)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
model_name = os.path.basename(model_path)
# Printing the Model class to allow for easier debugging. This can be useful
# when working with models that have not been publicly released yet and this
# migth require that the concrete class is imported and used directly instead
# of using AutoModelForCausalLM.
print(f"Model class: {model.__class__.__name__}")
prompt = "Hello, my name is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
outputs = model(input_ids)
logits = outputs.logits
# Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].cpu().numpy()
print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}")
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
# Save to file for comparison
last_logits.astype(np.float32).tofile(bin_filename)
# Also save as text file for easy inspection
with open(txt_filename, "w") as f:
for i, logit in enumerate(last_logits):
f.write(f"{i}: {logit:.6f}\n")
# Print some sample logits for quick verification
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
# Show top 5 predicted tokens
top_indices = np.argsort(last_logits)[-5:][::-1]
print("Top 5 predictions:")
for idx in top_indices:
token = tokenizer.decode([idx])
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")