ruvector/scripts/training/launch-gpu-training.sh
rUv 84e1886451 feat(decompiler): GPU training pipeline for neural name inference (ADR-136)
Training pipeline:
- generate-deobfuscation-data.mjs: 1,200+ training pairs from fixtures + synthetic
- train-deobfuscator.py: 6M param transformer (3 layers, 4 heads, 128 embed)
- export-to-rvf.py: PyTorch → ONNX → GGUF Q4 → RVF OVERLAY
- launch-gpu-training.sh: GCloud L4 GPU (--local, --cloud-run, --spot)
- Dockerfile.deobfuscator: pytorch/pytorch:2.2.0-cuda12.1

Decompiler integration:
- NeuralInferrer behind optional `neural` feature flag
- model_path in DecompileConfig
- Falls through to pattern-based when model unavailable
- Zero binary impact without feature flag

All tests pass, cargo check clean with and without neural feature.

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-03 02:08:19 +00:00

221 lines
7.1 KiB
Bash
Executable file

#!/bin/bash
# Launch deobfuscation model training locally or on a GCloud L4 GPU.
#
# Usage:
#   ./scripts/training/launch-gpu-training.sh --local      # Train on local GPU
#   ./scripts/training/launch-gpu-training.sh --cloud-run  # Cloud Run Job with GPU
#   ./scripts/training/launch-gpu-training.sh --spot       # Spot instance (cheapest)
#
# Optional environment overrides:
#   GCP_PROJECT  project id used for the bucket and image names (default: ruv-dev)
#   GCP_REGION   region passed to the Cloud Run job commands    (default: us-central1)
#   GCP_ZONE     zone passed to the Compute Engine commands     (default: <region>-a)
#
# Estimated cost: ~$1.40 (on-demand) or ~$0.57 (spot) for 2-hour training.
set -euo pipefail

PROJECT="${GCP_PROJECT:-ruv-dev}"
REGION="${GCP_REGION:-us-central1}"
# The zone follows the region unless set explicitly, so overriding only
# GCP_REGION never leaves the two pointing at different locations.
ZONE="${GCP_ZONE:-${REGION}-a}"
BUCKET="gs://${PROJECT}-training"
IMAGE="gcr.io/${PROJECT}/deobfuscator-trainer:latest"
# Absolute directory containing this script; sibling scripts are resolved from it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# First CLI argument selects the mode; no argument means --local.
MODE="${1:---local}"
# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------
# Write one line to stdout: the current time as [HH:MM:SS], a space, then all
# arguments joined into a single message.
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
# Verify that the command-line tools needed by the selected mode are available.
#
# Globals: MODE (read) - any mode other than --local also needs the Google
#          Cloud tools (gcloud and gsutil).
# Outputs: an "ERROR: ..." line on stderr naming the first missing tool.
# Returns: does not return on failure; exits the shell with status 1.
check_deps() {
  if [ "$MODE" != "--local" ]; then
    command -v gcloud >/dev/null 2>&1 || { echo "ERROR: gcloud CLI not installed" >&2; exit 1; }
    # The cloud modes copy data and models with gsutil, so fail early here
    # instead of half-way through an upload.
    command -v gsutil >/dev/null 2>&1 || { echo "ERROR: gsutil not installed" >&2; exit 1; }
  fi
  command -v python3 >/dev/null 2>&1 || { echo "ERROR: python3 not found" >&2; exit 1; }
}
# Make sure the training-data file exists and print its path.
#
# If ${SCRIPT_DIR}/../../training-data.jsonl is missing, it is generated by
# running generate-deobfuscation-data.mjs with node.
#
# Globals: SCRIPT_DIR (read)
# Outputs: stdout carries ONLY the data path, because callers capture it with
#          $(ensure_data); progress and generator output go to stderr.
# Returns: 0 on success, 1 if the generator fails.
ensure_data() {
  local data_path="${SCRIPT_DIR}/../../training-data.jsonl"
  if [ ! -f "$data_path" ]; then
    # Redirect to stderr so neither line pollutes the captured path.
    log "Generating training data..." >&2
    # Checked explicitly: bash clears errexit inside command substitutions,
    # so a failing generator would otherwise go unnoticed by the caller.
    if ! node "${SCRIPT_DIR}/generate-deobfuscation-data.mjs" --output "$data_path" >&2; then
      echo "ERROR: failed to generate training data" >&2
      return 1
    fi
  fi
  printf '%s\n' "$data_path"
}
# ---------------------------------------------------------------------------
# Local training
# ---------------------------------------------------------------------------
# Train on this machine, then export the result.
#
# Runs train-deobfuscator.py (30 epochs, batch size 64, ONNX export enabled)
# on the file reported by ensure_data, writing into <repo>/model, and then
# runs export-to-rvf.py on best_model.pt with q4 quantization.
#
# Globals: SCRIPT_DIR (read)
train_local() {
  log "Training locally..."

  local dataset
  dataset=$(ensure_data)
  local model_dir="${SCRIPT_DIR}/../../model"

  local -a train_cmd=(
    python3 "${SCRIPT_DIR}/train-deobfuscator.py"
    --data "$dataset"
    --output "$model_dir"
    --epochs 30
    --batch-size 64
    --export-onnx
  )
  "${train_cmd[@]}"

  log "Exporting to GGUF Q4 + RVF..."
  local -a export_cmd=(
    python3 "${SCRIPT_DIR}/export-to-rvf.py"
    --checkpoint "${model_dir}/best_model.pt"
    --output "${model_dir}/deobfuscator"
    --quantize q4
  )
  "${export_cmd[@]}"

  log "Done. Model at ${model_dir}/deobfuscator.gguf"
}
# ---------------------------------------------------------------------------
# Cloud Run Job with GPU
# ---------------------------------------------------------------------------
# Train on a Cloud Run Job with one L4 GPU and download the resulting files.
#
# Steps: upload the training data to ${BUCKET}; build the trainer image with
# Cloud Build if `gcloud container images describe` cannot find it; create or
# update the "deobfuscator-train" job; execute it and wait; copy
# ${BUCKET}/models/deobfuscator/* into <repo>/model.
#
# Globals: SCRIPT_DIR, BUCKET, IMAGE, REGION (read)
# Returns: the Cloud Build exit status if the image build fails.
train_cloud_run() {
  log "Launching Cloud Run GPU job..."

  # Upload training data.
  local data_path
  data_path=$(ensure_data)
  gsutil -q cp "$data_path" "${BUCKET}/deobfuscation-data.jsonl"
  log "Uploaded training data to ${BUCKET}/"

  # Build and push container if needed.
  if ! gcloud container images describe "$IMAGE" >/dev/null 2>&1; then
    log "Building container..."
    # `gcloud builds submit` has no -f/--file flag, and --tag only builds a
    # file named exactly "Dockerfile". Describe the docker build in a
    # throwaway Cloud Build config that points at Dockerfile.deobfuscator.
    local build_dir build_status=0
    build_dir=$(mktemp -d)
    cat > "${build_dir}/cloudbuild.yaml" <<BUILD_EOF
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '${IMAGE}', '-f', 'Dockerfile.deobfuscator', '.']
images:
- '${IMAGE}'
BUILD_EOF
    gcloud builds submit \
      --config "${build_dir}/cloudbuild.yaml" \
      --timeout=600 \
      "${SCRIPT_DIR}" || build_status=$?
    # Remove the temp config on both the success and the failure path.
    rm -rf -- "$build_dir"
    if [ "$build_status" -ne 0 ]; then
      return "$build_status"
    fi
  fi

  # Create or update the job. Both paths receive the full configuration so an
  # existing job cannot keep stale resources or env vars, and a genuine
  # create failure is reported instead of being hidden behind an update.
  local -a job_flags=(
    --image="$IMAGE"
    --task-timeout=7200
    --max-retries=1
    --cpu=4
    --memory=16Gi
    --gpu=1
    --gpu-type=nvidia-l4
    --region="$REGION"
    --set-env-vars="DATA_PATH=/tmp/data.jsonl,OUTPUT_DIR=/tmp/model,GCS_BUCKET=${BUCKET}"
    --quiet
  )
  if gcloud run jobs describe deobfuscator-train --region="$REGION" >/dev/null 2>&1; then
    gcloud run jobs update deobfuscator-train "${job_flags[@]}"
  else
    gcloud run jobs create deobfuscator-train "${job_flags[@]}"
  fi

  # Execute the job.
  log "Starting training job..."
  gcloud run jobs execute deobfuscator-train \
    --region="$REGION" \
    --wait

  # Download results.
  log "Downloading trained model..."
  local output_dir="${SCRIPT_DIR}/../../model"
  mkdir -p "$output_dir"
  gsutil -q cp "${BUCKET}/models/deobfuscator/*" "$output_dir/"
  log "Done. Model at ${output_dir}/"
}
# ---------------------------------------------------------------------------
# Spot instance (cheapest)
# ---------------------------------------------------------------------------
# Train on a Compute Engine spot VM (g2-standard-4 with one nvidia-l4).
#
# Uploads the training data and the two Python scripts to ${BUCKET}, then
# creates a VM whose startup script downloads them, trains, exports, uploads
# /tmp/model/* to ${BUCKET}/models/deobfuscator/ and deletes the VM.
# This function returns once the VM is created; it does not wait for training.
#
# Globals: SCRIPT_DIR, BUCKET, ZONE (read)
# Returns: the gcloud exit status if instance creation fails.
train_spot() {
  log "Launching spot instance for training..."

  # Upload training data and scripts.
  local data_path
  data_path=$(ensure_data)
  gsutil -q cp "$data_path" "${BUCKET}/deobfuscation-data.jsonl"
  gsutil -q cp "${SCRIPT_DIR}/train-deobfuscator.py" "${BUCKET}/train-deobfuscator.py"
  gsutil -q cp "${SCRIPT_DIR}/export-to-rvf.py" "${BUCKET}/export-to-rvf.py"
  log "Uploaded data and scripts to ${BUCKET}/"

  # Render the startup script with the real bucket BEFORE the VM exists.
  # Patching metadata after creation races against the first boot, which
  # could run the script while it still contains a placeholder.
  local startup_dir startup_script
  startup_dir=$(mktemp -d)
  startup_script="${startup_dir}/startup.sh"
  {
    printf '#!/bin/bash\n'
    printf 'set -euo pipefail\n'
    printf 'export BUCKET=%q\n' "$BUCKET"
    cat <<'STARTUP_EOF'

# Delete this VM on every exit path, so a failed step (errexit) does not
# leave a GPU instance running and billing.
self_destruct() {
  local md="http://metadata.google.internal/computeMetadata/v1/instance"
  local name zone
  name=$(curl -s -H "Metadata-Flavor: Google" "${md}/name")
  zone=$(curl -s -H "Metadata-Flavor: Google" "${md}/zone" | cut -d'/' -f4)
  gcloud compute instances delete "$name" --zone="$zone" --quiet
}
trap self_destruct EXIT

# Download data and scripts.
gsutil cp "${BUCKET}/deobfuscation-data.jsonl" /tmp/data.jsonl
gsutil cp "${BUCKET}/train-deobfuscator.py" /tmp/train-deobfuscator.py
gsutil cp "${BUCKET}/export-to-rvf.py" /tmp/export-to-rvf.py

# Install dependencies.
pip install torch onnx numpy

# Train.
cd /tmp
python train-deobfuscator.py --data data.jsonl --output /tmp/model --epochs 30 --export-onnx

# Export to GGUF Q4.
python export-to-rvf.py --checkpoint /tmp/model/best_model.pt --output /tmp/model/deobfuscator --quantize q4

# Upload results.
gsutil -m cp /tmp/model/* "${BUCKET}/models/deobfuscator/"
STARTUP_EOF
  } > "$startup_script"

  # Create spot instance with startup script.
  local instance_name
  instance_name="deobfuscator-trainer-$(date +%s)"
  local create_status=0
  # compute-rw is needed in addition to storage-full: the self-destruct step
  # calls the Compute Engine API to delete the instance.
  # --metadata-from-file avoids KEY=VALUE list parsing of the script text.
  gcloud compute instances create "$instance_name" \
    --zone="$ZONE" \
    --machine-type=g2-standard-4 \
    --accelerator=type=nvidia-l4,count=1 \
    --maintenance-policy=TERMINATE \
    --provisioning-model=SPOT \
    --image-family=pytorch-latest-gpu \
    --image-project=deeplearning-platform-release \
    --boot-disk-size=50GB \
    --scopes=storage-full,compute-rw \
    --metadata-from-file=startup-script="$startup_script" \
    --quiet || create_status=$?
  # Remove the rendered script on both the success and the failure path.
  rm -rf -- "$startup_dir"
  if [ "$create_status" -ne 0 ]; then
    return "$create_status"
  fi

  log "Spot instance '$instance_name' launched."
  log "Monitor: gcloud compute instances get-serial-port-output $instance_name --zone=$ZONE"
  log "Results will appear at: ${BUCKET}/models/deobfuscator/"
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# Dispatch on the requested mode. Dependency checks run only for the real
# training modes, so --help and mistyped modes never fail on a missing
# gcloud or python3 before printing usage.
case "$MODE" in
  --local)
    check_deps
    train_local
    ;;
  --cloud-run)
    check_deps
    train_cloud_run
    ;;
  --spot)
    check_deps
    train_spot
    ;;
  --help|-h)
    echo "Usage: $0 [--local|--cloud-run|--spot]"
    echo ""
    echo " --local Train on local machine (GPU or CPU)"
    echo " --cloud-run Use Cloud Run Job with L4 GPU (~\$1.40)"
    echo " --spot Use Compute Engine spot instance (~\$0.57)"
    exit 0
    ;;
  *)
    # Diagnostics belong on stderr so they are not mistaken for output.
    echo "Unknown mode: $MODE" >&2
    echo "Usage: $0 [--local|--cloud-run|--spot]" >&2
    exit 1
    ;;
esac