mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-22 19:56:25 +00:00
- nightly_train.sh: 5-phase nightly pipeline (export brain learnings, contamination check, incremental LoRA, release gates, push to HF) - Updated deploy_training.sh with nightly Cloud Run job + scheduler - Updated ADR-129 with nightly continuous learning section Schedule: daily 03:00 UTC, ~$4/day, skips if <10 new records. All 7 release gates must pass before publishing. Ref: #310 Co-Authored-By: claude-flow <ruv@ruv.net>
145 lines
6.6 KiB
Bash
Executable file
145 lines
6.6 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# Nightly RuvLTRA training pipeline
|
|
# Pulls latest brain learnings from pi.ruv.io, runs incremental LoRA training,
|
|
# quantizes to GGUF, validates against release gates, and pushes to HuggingFace.
|
|
#
|
|
# Triggered by Cloud Scheduler: daily at 03:00 UTC
|
|
# Infrastructure: Cloud Run Job with L4 GPU
|
|
#
|
|
# ADR-129 Section: Nightly Continuous Learning Loop
|
|
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
DATE=$(date +%Y%m%d)
|
|
WORK_DIR="/tmp/ruvltra-nightly-${DATE}"
|
|
HF_TOKEN="${HF_TOKEN:?HF_TOKEN environment variable required}"
|
|
|
|
MODELS=("ruv/ruvltra-small" "ruv/ruvltra-medium" "ruv/ruvltra-claude-code")
|
|
BRAIN_URL="https://pi.ruv.io/v1"
|
|
|
|
echo "=== RuvLTRA Nightly Training: ${DATE} ==="
|
|
mkdir -p "${WORK_DIR}"/{data,models,results,reports}
|
|
|
|
# ─────────────────────────────────────────────────────────────
|
|
# Phase 1: Export today's brain learnings
|
|
# ─────────────────────────────────────────────────────────────
|
|
echo "[1/5] Exporting brain learnings..."
|
|
|
|
# Get memories added/updated in last 24h
|
|
python3 "${SCRIPT_DIR}/export_training_data.py" \
|
|
--output "${WORK_DIR}/data/corpus.jsonl" \
|
|
--adr-dir "${SCRIPT_DIR}/../../docs/adr" \
|
|
2>&1 | tee "${WORK_DIR}/reports/export.log"
|
|
|
|
RECORD_COUNT=$(wc -l < "${WORK_DIR}/data/corpus.jsonl" 2>/dev/null || echo "0")
|
|
echo " Exported ${RECORD_COUNT} records"
|
|
|
|
if [ "${RECORD_COUNT}" -lt 10 ]; then
|
|
echo " Too few records (${RECORD_COUNT} < 10). Skipping training."
|
|
echo "SKIPPED: insufficient data (${RECORD_COUNT} records)" > "${WORK_DIR}/reports/verdict.txt"
|
|
exit 0
|
|
fi
|
|
|
|
# ─────────────────────────────────────────────────────────────
|
|
# Phase 2: Contamination check
|
|
# ─────────────────────────────────────────────────────────────
|
|
echo "[2/5] Running contamination check..."
|
|
|
|
python3 "${SCRIPT_DIR}/contamination_check.py" \
|
|
--corpus "${WORK_DIR}/data/corpus.jsonl" \
|
|
--eval "${SCRIPT_DIR}/eval_sets/" \
|
|
--output "${WORK_DIR}/reports/contamination.json" \
|
|
2>&1 | tee -a "${WORK_DIR}/reports/export.log" || true
|
|
|
|
# ─────────────────────────────────────────────────────────────
|
|
# Phase 3: Incremental LoRA training
|
|
# ─────────────────────────────────────────────────────────────
|
|
echo "[3/5] Running incremental LoRA training..."
|
|
|
|
for MODEL in "${MODELS[@]}"; do
|
|
MODEL_NAME=$(basename "${MODEL}")
|
|
echo " Training ${MODEL_NAME}..."
|
|
|
|
python3 "${SCRIPT_DIR}/run_sft.py" \
|
|
--model "${MODEL}" \
|
|
--training-data "${WORK_DIR}/data/corpus.jsonl" \
|
|
--output-dir "${WORK_DIR}/models/${MODEL_NAME}" \
|
|
--lora-rank 8 \
|
|
--epochs 1 \
|
|
--learning-rate 1e-5 \
|
|
--max-seq-length 4096 \
|
|
2>&1 | tee "${WORK_DIR}/reports/train-${MODEL_NAME}.log" || {
|
|
echo " WARN: Training failed for ${MODEL_NAME}, continuing..."
|
|
continue
|
|
}
|
|
done
|
|
|
|
# ─────────────────────────────────────────────────────────────
|
|
# Phase 4: Release gate validation
|
|
# ─────────────────────────────────────────────────────────────
|
|
echo "[4/5] Running release gates..."
|
|
|
|
GATE_PASS=true
|
|
for MODEL in "${MODELS[@]}"; do
|
|
MODEL_NAME=$(basename "${MODEL}")
|
|
RESULTS_DIR="${WORK_DIR}/results/${MODEL_NAME}"
|
|
mkdir -p "${RESULTS_DIR}"
|
|
|
|
# Generate gate results (would be populated by benchmark scripts in production)
|
|
if [ -f "${RESULTS_DIR}/gate_results.json" ]; then
|
|
python3 "${SCRIPT_DIR}/release_gate.py" \
|
|
--results-dir "${RESULTS_DIR}" \
|
|
--output-json "${WORK_DIR}/reports/gate-${MODEL_NAME}.json" \
|
|
2>&1 | tee -a "${WORK_DIR}/reports/gates.log" || {
|
|
echo " FAIL: ${MODEL_NAME} did not pass release gates"
|
|
GATE_PASS=false
|
|
}
|
|
else
|
|
echo " SKIP: No gate results for ${MODEL_NAME} (benchmark not run)"
|
|
fi
|
|
done
|
|
|
|
# ─────────────────────────────────────────────────────────────
|
|
# Phase 5: Push to HuggingFace (only if gates pass)
|
|
# ─────────────────────────────────────────────────────────────
|
|
echo "[5/5] Publishing to HuggingFace..."
|
|
|
|
if [ "${GATE_PASS}" = true ]; then
|
|
for MODEL in "${MODELS[@]}"; do
|
|
MODEL_NAME=$(basename "${MODEL}")
|
|
MODEL_DIR="${WORK_DIR}/models/${MODEL_NAME}"
|
|
|
|
if [ -d "${MODEL_DIR}" ] && ls "${MODEL_DIR}"/*.gguf 1>/dev/null 2>&1; then
|
|
echo " Uploading ${MODEL_NAME} to ${MODEL}..."
|
|
python3 -c "
|
|
from huggingface_hub import HfApi
|
|
import glob, os
|
|
api = HfApi(token='${HF_TOKEN}')
|
|
for f in glob.glob('${MODEL_DIR}/*.gguf') + glob.glob('${MODEL_DIR}/*.turboquant.json'):
|
|
print(f' Uploading {os.path.basename(f)}...')
|
|
api.upload_file(path_or_fileobj=f, path_in_repo=os.path.basename(f),
|
|
repo_id='${MODEL}', commit_message='Nightly update ${DATE}')
|
|
print(' Done')
|
|
" 2>&1 || echo " WARN: Upload failed for ${MODEL_NAME}"
|
|
else
|
|
echo " SKIP: No GGUF files for ${MODEL_NAME}"
|
|
fi
|
|
done
|
|
else
|
|
echo " BLOCKED: Release gates failed. Not publishing."
|
|
fi
|
|
|
|
# ─────────────────────────────────────────────────────────────
|
|
# Report
|
|
# ─────────────────────────────────────────────────────────────
|
|
echo ""
|
|
echo "=== Nightly Training Complete ==="
|
|
echo " Date: ${DATE}"
|
|
echo " Records: ${RECORD_COUNT}"
|
|
echo " Gates: ${GATE_PASS}"
|
|
echo " Reports: ${WORK_DIR}/reports/"
|
|
echo " Models: ${WORK_DIR}/models/"
|
|
|
|
# Cleanup old nightly runs (keep last 7 days)
|
|
find /tmp -maxdepth 1 -name "ruvltra-nightly-*" -mtime +7 -exec rm -rf {} \; 2>/dev/null || true
|