mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-23 21:25:02 +00:00
GPU-enabled Cloud Run jobs have a maximum timeout of 1 hour. The previous 7200s (2hr) setting was rejected by the API. Co-Authored-By: claude-flow <ruv@ruv.net>
236 lines
9.2 KiB
Bash
Executable file
236 lines
9.2 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# Deploy RuvLTRA training pipeline to Cloud Run Jobs
|
|
# Creates: calibration, SFT training, and benchmark jobs
|
|
#
|
|
# Usage: ./scripts/training/deploy_training.sh [--project PROJECT_ID] [--region REGION]
|
|
set -euo pipefail
|
|
|
|
PROJECT_ID="${GCP_PROJECT_ID:-ruv-dev}"
|
|
REGION="${GCP_REGION:-us-central1}"
|
|
IMAGE="gcr.io/${PROJECT_ID}/ruvltra-training:latest"
|
|
SA_EMAIL="${PROJECT_ID}@appspot.gserviceaccount.com"
|
|
|
|
# Parse args
|
|
while [[ $# -gt 0 ]]; do
|
|
case $1 in
|
|
--project) PROJECT_ID="$2"; IMAGE="gcr.io/${PROJECT_ID}/ruvltra-training:latest"; shift 2 ;;
|
|
--region) REGION="$2"; shift 2 ;;
|
|
*) echo "Unknown option: $1"; exit 1 ;;
|
|
esac
|
|
done
|
|
|
|
echo "╔══════════════════════════════════════════════════════════════╗"
|
|
echo "║ RuvLTRA Training Pipeline — Cloud Run Deploy ║"
|
|
echo "║ Calibration · SFT · Benchmarking ║"
|
|
echo "╚══════════════════════════════════════════════════════════════╝"
|
|
echo ""
|
|
echo " Project: ${PROJECT_ID}"
|
|
echo " Region: ${REGION}"
|
|
echo " Image: ${IMAGE}"
|
|
echo ""
|
|
|
|
# --- Step 1: Build and push the training image ---
|
|
echo "▸ [1/5] Building training image..."
|
|
gcloud builds submit \
|
|
--tag="${IMAGE}" \
|
|
--project="${PROJECT_ID}" \
|
|
--timeout=1800s \
|
|
--machine-type=e2-highcpu-8 \
|
|
.
|
|
|
|
# --- Step 2: Create calibration job (imatrix + TurboQuant) ---
|
|
echo "▸ [2/5] Creating ruvltra-calibration job..."
|
|
JOB_NAME="ruvltra-calibration"
|
|
gcloud run jobs create "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=24Gi \
|
|
--cpu=4 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=3600s \
|
|
--args="run_calibration.py,--model-id,ruvnet/ruvLTRA-7b,--upload" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1" \
|
|
2>/dev/null || \
|
|
gcloud run jobs update "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=24Gi \
|
|
--cpu=4 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=3600s \
|
|
--args="run_calibration.py,--model-id,ruvnet/ruvLTRA-7b,--upload" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1"
|
|
|
|
echo " ✓ ${JOB_NAME} ready"
|
|
|
|
# --- Step 3: Create SFT training job (Vertex AI for larger models) ---
|
|
echo "▸ [3/5] Creating ruvltra-sft-training job..."
|
|
JOB_NAME="ruvltra-sft-training"
|
|
gcloud run jobs create "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=32Gi \
|
|
--cpu=8 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=14400s \
|
|
--args="run_sft.py,--model-id,ruvnet/ruvLTRA-7b,--corpus,data/training/corpus.jsonl,--output-dir,/tmp/sft-output" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1,WANDB_DISABLED=true" \
|
|
2>/dev/null || \
|
|
gcloud run jobs update "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=32Gi \
|
|
--cpu=8 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=14400s \
|
|
--args="run_sft.py,--model-id,ruvnet/ruvLTRA-7b,--corpus,data/training/corpus.jsonl,--output-dir,/tmp/sft-output" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1,WANDB_DISABLED=true"
|
|
|
|
echo " ✓ ${JOB_NAME} ready"
|
|
|
|
# --- Step 4: Create benchmark job ---
|
|
echo "▸ [4/5] Creating ruvltra-benchmark job..."
|
|
JOB_NAME="ruvltra-benchmark"
|
|
gcloud run jobs create "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=24Gi \
|
|
--cpu=4 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=3600s \
|
|
--args="run_calibration.py,--model-id,ruvnet/ruvLTRA-7b,--benchmark-only" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1" \
|
|
2>/dev/null || \
|
|
gcloud run jobs update "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=24Gi \
|
|
--cpu=4 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=3600s \
|
|
--args="run_calibration.py,--model-id,ruvnet/ruvLTRA-7b,--benchmark-only" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1"
|
|
|
|
echo " ✓ ${JOB_NAME} ready"
|
|
|
|
# --- Step 5: Set up weekly benchmark scheduler ---
|
|
echo "▸ [5/5] Setting up weekly benchmark schedule..."
|
|
SCHEDULER_NAME="ruvltra-benchmark-weekly"
|
|
gcloud scheduler jobs create http "${SCHEDULER_NAME}" \
|
|
--location="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--schedule="0 6 * * 1" \
|
|
--time-zone="UTC" \
|
|
--uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/ruvltra-benchmark:run" \
|
|
--http-method=POST \
|
|
--oauth-service-account-email="${SA_EMAIL}" \
|
|
--description="Weekly RuvLTRA benchmark run (Mondays 06:00 UTC)" \
|
|
2>/dev/null || \
|
|
gcloud scheduler jobs update http "${SCHEDULER_NAME}" \
|
|
--location="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--schedule="0 6 * * 1" \
|
|
--time-zone="UTC" \
|
|
--uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/ruvltra-benchmark:run" \
|
|
--http-method=POST \
|
|
--oauth-service-account-email="${SA_EMAIL}" \
|
|
--description="Weekly RuvLTRA benchmark run (Mondays 06:00 UTC)"
|
|
|
|
echo " ✓ Scheduler set: every Monday at 06:00 UTC"
|
|
|
|
# --- Step 6: Create nightly training job ---
|
|
echo "▸ [6/7] Creating ruvltra-nightly-train job..."
|
|
JOB_NAME="ruvltra-nightly-train"
|
|
gcloud run jobs create "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=32Gi \
|
|
--cpu=8 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=3600s \
|
|
--args="bash,scripts/training/nightly_train.sh" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1,WANDB_DISABLED=true" \
|
|
2>/dev/null || \
|
|
gcloud run jobs update "${JOB_NAME}" \
|
|
--image="${IMAGE}" \
|
|
--region="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--memory=32Gi \
|
|
--cpu=8 \
|
|
--gpu=1 \
|
|
--gpu-type=nvidia-l4 \
|
|
--max-retries=1 \
|
|
--task-timeout=3600s \
|
|
--args="bash,scripts/training/nightly_train.sh" \
|
|
--set-secrets="HF_TOKEN=huggingface-token:latest" \
|
|
--set-env-vars="PYTHONUNBUFFERED=1,WANDB_DISABLED=true"
|
|
|
|
echo " ✓ ${JOB_NAME} ready"
|
|
|
|
# --- Step 7: Set up nightly training scheduler ---
|
|
echo "▸ [7/7] Setting up nightly training schedule..."
|
|
SCHEDULER_NAME="ruvltra-nightly-train"
|
|
gcloud scheduler jobs create http "${SCHEDULER_NAME}" \
|
|
--location="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--schedule="0 3 * * *" \
|
|
--time-zone="UTC" \
|
|
--uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/ruvltra-nightly-train:run" \
|
|
--http-method=POST \
|
|
--oauth-service-account-email="${SA_EMAIL}" \
|
|
--description="Nightly RuvLTRA training from brain learnings (03:00 UTC)" \
|
|
2>/dev/null || \
|
|
gcloud scheduler jobs update http "${SCHEDULER_NAME}" \
|
|
--location="${REGION}" \
|
|
--project="${PROJECT_ID}" \
|
|
--schedule="0 3 * * *" \
|
|
--time-zone="UTC" \
|
|
--uri="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/ruvltra-nightly-train:run" \
|
|
--http-method=POST \
|
|
--oauth-service-account-email="${SA_EMAIL}" \
|
|
--description="Nightly RuvLTRA training from brain learnings (03:00 UTC)"
|
|
|
|
echo " ✓ Nightly training scheduled: daily at 03:00 UTC"
|
|
|
|
echo ""
|
|
echo "╔══════════════════════════════════════════════════════════════╗"
|
|
echo "║ Deployment complete! ║"
|
|
echo "║ ║"
|
|
echo "║ Run manually: ║"
|
|
echo "║ gcloud run jobs execute ruvltra-calibration --region=${REGION} ║"
|
|
echo "║ gcloud run jobs execute ruvltra-sft-training --region=${REGION} ║"
|
|
echo "║ gcloud run jobs execute ruvltra-benchmark --region=${REGION} ║"
|
|
echo "║ gcloud run jobs execute ruvltra-nightly-train --region=${REGION} ║"
|
|
echo "║ ║"
|
|
echo "║ Schedules: ║"
|
|
echo "║ Weekly benchmark: Mondays 06:00 UTC ║"
|
|
echo "║ Nightly training: Daily 03:00 UTC ║"
|
|
echo "╚══════════════════════════════════════════════════════════════╝"
|