ruvector/scripts/training/deploy_training.sh
rUv 58f620ca28 fix(training): use 3600s timeout for GPU Cloud Run jobs
GPU-enabled Cloud Run jobs have a maximum timeout of 1 hour.
The previous 7200s (2hr) setting was rejected by the API.

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-03-28 12:21:58 +00:00

236 lines
9.2 KiB
Bash
Executable file

#!/usr/bin/env bash
# Deploy RuvLTRA training pipeline to Cloud Run Jobs
# Creates: calibration, SFT training, and benchmark jobs
#
# Usage: ./scripts/training/deploy_training.sh [--project PROJECT_ID] [--region REGION]
set -euo pipefail

# Defaults come from the environment and may be overridden on the command line.
PROJECT_ID="${GCP_PROJECT_ID:-ruv-dev}"
REGION="${GCP_REGION:-us-central1}"

#######################################
# Parse command-line options, then derive the project-dependent settings.
# IMAGE and SA_EMAIL are computed only after parsing so that a --project
# override is reflected in both of them.
# Globals:   PROJECT_ID, REGION (read and written); IMAGE, SA_EMAIL (written)
# Arguments: the script's command-line arguments
# Outputs:   usage errors on stderr
# Returns:   0 on success; exits 1 on an unknown option or a missing value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --project | --region)
        # Under `set -u` a bare "$2" would abort with "unbound variable";
        # report the actual usage mistake instead.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Option $1 requires a value" >&2
          exit 1
        fi
        if [[ "$1" == "--project" ]]; then
          PROJECT_ID="$2"
        else
          REGION="$2"
        fi
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Container image built in step 1 and referenced by every Cloud Run job.
  IMAGE="gcr.io/${PROJECT_ID}/ruvltra-training:latest"
  # Service account the Cloud Scheduler triggers authenticate as.
  SA_EMAIL="${PROJECT_ID}@appspot.gserviceaccount.com"
}

parse_args "$@"
# Announce the deployment target (project, region, image) before any gcloud
# command runs. One printf call emits each argument on its own line.
printf '%s\n' \
  "╔══════════════════════════════════════════════════════════════╗" \
  "║ RuvLTRA Training Pipeline — Cloud Run Deploy ║" \
  "║ Calibration · SFT · Benchmarking ║" \
  "╚══════════════════════════════════════════════════════════════╝" \
  "" \
  " Project: ${PROJECT_ID}" \
  " Region: ${REGION}" \
  " Image: ${IMAGE}" \
  ""
# --- Step 1: Build and push the training image ---
# Submits the current working directory (".") to Cloud Build and tags the
# result as ${IMAGE}. The progress label counts all seven steps of this script.
echo "▸ [1/7] Building training image..."
build_flags=(
  --tag="${IMAGE}"
  --project="${PROJECT_ID}"
  --timeout=1800s
  --machine-type=e2-highcpu-8
)
gcloud builds submit "${build_flags[@]}" .
# --- Step 2: Create calibration job (imatrix + TurboQuant) ---
# Creates the Cloud Run job on the first deploy and updates it in place on
# later deploys. Existence is probed with `describe` rather than by discarding
# the stderr of `create`, so a genuine create failure is reported as such.
echo "▸ [2/7] Creating ruvltra-calibration job..."
JOB_NAME="ruvltra-calibration"

# Flags shared by `create` and `update`, kept in one list so they cannot drift.
job_flags=(
  --image="${IMAGE}"
  --region="${REGION}"
  --project="${PROJECT_ID}"
  --memory=24Gi
  --cpu=4
  --gpu=1
  --gpu-type=nvidia-l4
  --max-retries=1
  --task-timeout=3600s
  --args="run_calibration.py,--model-id,ruvnet/ruvLTRA-7b,--upload"
  --set-secrets="HF_TOKEN=huggingface-token:latest"
  --set-env-vars="PYTHONUNBUFFERED=1"
)

# If `describe` fails for any reason the script attempts `create`, whose own
# error output is then shown to the operator.
if gcloud run jobs describe "${JOB_NAME}" \
  --region="${REGION}" --project="${PROJECT_ID}" >/dev/null 2>&1; then
  job_action="update"
else
  job_action="create"
fi
gcloud run jobs "${job_action}" "${JOB_NAME}" "${job_flags[@]}"
echo "${JOB_NAME} ready"
# --- Step 3: Create SFT training job (Vertex AI for larger models) ---
# Creates the Cloud Run job on the first deploy and updates it in place on
# later deploys. Existence is probed with `describe` rather than by discarding
# the stderr of `create`, so a genuine create failure is reported as such.
# NOTE(review): the heading mentions Vertex AI, but this step only deploys a
# Cloud Run job — confirm where larger models are meant to be trained.
echo "▸ [3/7] Creating ruvltra-sft-training job..."
JOB_NAME="ruvltra-sft-training"

# Flags shared by `create` and `update`, kept in one list so they cannot drift.
job_flags=(
  --image="${IMAGE}"
  --region="${REGION}"
  --project="${PROJECT_ID}"
  --memory=32Gi
  --cpu=8
  --gpu=1
  --gpu-type=nvidia-l4
  --max-retries=1
  # GPU-enabled Cloud Run jobs are capped at a 1 hour task timeout; longer
  # values are rejected by the API, so this job uses the same 3600s limit as
  # the other GPU jobs in this script.
  --task-timeout=3600s
  --args="run_sft.py,--model-id,ruvnet/ruvLTRA-7b,--corpus,data/training/corpus.jsonl,--output-dir,/tmp/sft-output"
  --set-secrets="HF_TOKEN=huggingface-token:latest"
  --set-env-vars="PYTHONUNBUFFERED=1,WANDB_DISABLED=true"
)

# If `describe` fails for any reason the script attempts `create`, whose own
# error output is then shown to the operator.
if gcloud run jobs describe "${JOB_NAME}" \
  --region="${REGION}" --project="${PROJECT_ID}" >/dev/null 2>&1; then
  job_action="update"
else
  job_action="create"
fi
gcloud run jobs "${job_action}" "${JOB_NAME}" "${job_flags[@]}"
echo "${JOB_NAME} ready"
# --- Step 4: Create benchmark job ---
# Creates the Cloud Run job on the first deploy and updates it in place on
# later deploys. Existence is probed with `describe` rather than by discarding
# the stderr of `create`, so a genuine create failure is reported as such.
echo "▸ [4/7] Creating ruvltra-benchmark job..."
JOB_NAME="ruvltra-benchmark"

# Flags shared by `create` and `update`, kept in one list so they cannot drift.
# The job runs the calibration entry point with --benchmark-only.
job_flags=(
  --image="${IMAGE}"
  --region="${REGION}"
  --project="${PROJECT_ID}"
  --memory=24Gi
  --cpu=4
  --gpu=1
  --gpu-type=nvidia-l4
  --max-retries=1
  --task-timeout=3600s
  --args="run_calibration.py,--model-id,ruvnet/ruvLTRA-7b,--benchmark-only"
  --set-secrets="HF_TOKEN=huggingface-token:latest"
  --set-env-vars="PYTHONUNBUFFERED=1"
)

# If `describe` fails for any reason the script attempts `create`, whose own
# error output is then shown to the operator.
if gcloud run jobs describe "${JOB_NAME}" \
  --region="${REGION}" --project="${PROJECT_ID}" >/dev/null 2>&1; then
  job_action="update"
else
  job_action="create"
fi
gcloud run jobs "${job_action}" "${JOB_NAME}" "${job_flags[@]}"
echo "${JOB_NAME} ready"
# --- Step 5: Set up weekly benchmark scheduler ---
# Creates the Cloud Scheduler HTTP job on the first deploy and updates it on
# later deploys. Existence is probed with `describe` rather than by discarding
# the stderr of `create`, so a genuine create failure is reported as such.
echo "▸ [5/7] Setting up weekly benchmark schedule..."
SCHEDULER_NAME="ruvltra-benchmark-weekly"

# Flags shared by `create` and `update`. The trigger POSTs to the :run
# endpoint of the ruvltra-benchmark job, authenticating as ${SA_EMAIL}.
scheduler_flags=(
  --location="${REGION}"
  --project="${PROJECT_ID}"
  --schedule="0 6 * * 1"
  --time-zone="UTC"
  --uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/ruvltra-benchmark:run"
  --http-method=POST
  --oauth-service-account-email="${SA_EMAIL}"
  --description="Weekly RuvLTRA benchmark run (Mondays 06:00 UTC)"
)

# If `describe` fails for any reason the script attempts `create`, whose own
# error output is then shown to the operator.
if gcloud scheduler jobs describe "${SCHEDULER_NAME}" \
  --location="${REGION}" --project="${PROJECT_ID}" >/dev/null 2>&1; then
  scheduler_action="update"
else
  scheduler_action="create"
fi
gcloud scheduler jobs "${scheduler_action}" http "${SCHEDULER_NAME}" \
  "${scheduler_flags[@]}"
echo " ✓ Scheduler set: every Monday at 06:00 UTC"
# --- Step 6: Create nightly training job ---
# Creates the Cloud Run job on the first deploy and updates it in place on
# later deploys. Existence is probed with `describe` rather than by discarding
# the stderr of `create`, so a genuine create failure is reported as such.
echo "▸ [6/7] Creating ruvltra-nightly-train job..."
JOB_NAME="ruvltra-nightly-train"

# Flags shared by `create` and `update`, kept in one list so they cannot drift.
# The container is started with `bash scripts/training/nightly_train.sh`.
job_flags=(
  --image="${IMAGE}"
  --region="${REGION}"
  --project="${PROJECT_ID}"
  --memory=32Gi
  --cpu=8
  --gpu=1
  --gpu-type=nvidia-l4
  --max-retries=1
  --task-timeout=3600s
  --args="bash,scripts/training/nightly_train.sh"
  --set-secrets="HF_TOKEN=huggingface-token:latest"
  --set-env-vars="PYTHONUNBUFFERED=1,WANDB_DISABLED=true"
)

# If `describe` fails for any reason the script attempts `create`, whose own
# error output is then shown to the operator.
if gcloud run jobs describe "${JOB_NAME}" \
  --region="${REGION}" --project="${PROJECT_ID}" >/dev/null 2>&1; then
  job_action="update"
else
  job_action="create"
fi
gcloud run jobs "${job_action}" "${JOB_NAME}" "${job_flags[@]}"
echo "${JOB_NAME} ready"
# --- Step 7: Set up nightly training scheduler ---
# Creates the Cloud Scheduler HTTP job on the first deploy and updates it on
# later deploys. Existence is probed with `describe` rather than by discarding
# the stderr of `create`, so a genuine create failure is reported as such.
echo "▸ [7/7] Setting up nightly training schedule..."
SCHEDULER_NAME="ruvltra-nightly-train"

# Flags shared by `create` and `update`. The trigger POSTs to the :run
# endpoint of the ruvltra-nightly-train job, authenticating as ${SA_EMAIL}.
scheduler_flags=(
  --location="${REGION}"
  --project="${PROJECT_ID}"
  --schedule="0 3 * * *"
  --time-zone="UTC"
  --uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/ruvltra-nightly-train:run"
  --http-method=POST
  --oauth-service-account-email="${SA_EMAIL}"
  --description="Nightly RuvLTRA training from brain learnings (03:00 UTC)"
)

# If `describe` fails for any reason the script attempts `create`, whose own
# error output is then shown to the operator.
if gcloud scheduler jobs describe "${SCHEDULER_NAME}" \
  --location="${REGION}" --project="${PROJECT_ID}" >/dev/null 2>&1; then
  scheduler_action="update"
else
  scheduler_action="create"
fi
gcloud scheduler jobs "${scheduler_action}" http "${SCHEDULER_NAME}" \
  "${scheduler_flags[@]}"
echo " ✓ Nightly training scheduled: daily at 03:00 UTC"
# Closing summary: how to trigger each job by hand and when the two
# schedulers fire. One printf call emits each argument on its own line.
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════════════╗" \
  "║ Deployment complete! ║" \
  "║ ║" \
  "║ Run manually: ║" \
  "║ gcloud run jobs execute ruvltra-calibration --region=${REGION}" \
  "║ gcloud run jobs execute ruvltra-sft-training --region=${REGION}" \
  "║ gcloud run jobs execute ruvltra-benchmark --region=${REGION}" \
  "║ gcloud run jobs execute ruvltra-nightly-train --region=${REGION}" \
  "║ ║" \
  "║ Schedules: ║" \
  "║ Weekly benchmark: Mondays 06:00 UTC ║" \
  "║ Nightly training: Daily 03:00 UTC ║" \
  "╚══════════════════════════════════════════════════════════════╝"