mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-25 15:03:46 +00:00
Deploy CDX-targeted crawl for PubMed + dermatology domains via Cloud Scheduler. Uses static Bearer auth (brain server API key) instead of OIDC since Cloud Run allows unauthenticated access and brain's auth rejects long JWT tokens. Jobs: brain-crawl-medical (daily 2AM, 100 pages), brain-crawl-derm (daily 3AM, 50 pages), brain-partition-cache (hourly graph rebuild). Tested: 10 new memories injected from first run (1568->1578). CDX falls back to Wayback API from Cloud Run. ADR-118 Phase 1 implementation. Co-Authored-By: claude-flow <ruv@ruv.net>
108 lines
5 KiB
Bash
Executable file
108 lines
5 KiB
Bash
Executable file
#!/bin/bash
# Deploy Common Crawl Phase 1 scheduler jobs (ADR-118)
# Medical domain only: $11-28/month budget
#
# Usage: <this script> [PROJECT]
#   PROJECT       GCP project id (first positional argument, default: ruv-dev)
#   BEARER_TOKEN  optional environment override for the scheduler's Bearer
#                 token (default: brain-crawl-phase1-scheduler)
#
# Auth: The brain server uses Bearer token auth (any token 8-256 chars).
# Cloud Scheduler sends the token via --headers instead of OIDC
# because Cloud Run's allow-unauthenticated + brain's own auth
# means OIDC JWTs (>256 chars) get rejected.
set -euo pipefail

PROJECT="${1:-ruv-dev}"
REGION="us-central1"
BRAIN_URL="https://pi.ruv.io"
# Keep the historical default, but let the caller supply a different token via
# the environment so the credential does not have to live in the script.
BEARER_TOKEN="${BEARER_TOKEN:-brain-crawl-phase1-scheduler}"

# Fail early on a token outside the 8-256 character window described above,
# rather than deploying jobs whose requests would be rejected.
if (( ${#BEARER_TOKEN} < 8 || ${#BEARER_TOKEN} > 256 )); then
  echo "error: BEARER_TOKEN must be 8-256 characters (got ${#BEARER_TOKEN})" >&2
  exit 1
fi

echo "=== Common Crawl Phase 1 Deployment ==="
echo "Project: ${PROJECT}"
echo "Budget: \$11-28/month"
echo ""
|
|
|
|
#######################################
# Create a Cloud Scheduler HTTP job, or update it in place when it already
# exists. Existence is probed with `describe` so that exactly one of
# create/update runs and its diagnostics stay visible on stderr; a failed
# create is no longer silenced and retried blindly as an update.
# Globals:   PROJECT, REGION, BRAIN_URL, BEARER_TOKEN (read)
# Arguments: $1 - scheduler job name
#            $2 - cron schedule expression
#            $3 - URL path appended to BRAIN_URL
#            $4 - JSON request body sent with the POST
#            $5 - job description
# Outputs:   progress lines on stdout; gcloud output passes through
# Returns:   0 on success, otherwise the exit status of the failing
#            gcloud create/update call
#######################################
upsert_scheduler_job() {
  local name=$1 schedule=$2 path=$3 body=$4 description=$5
  local action headers_flag

  echo "Creating/updating ${name}..."

  # Only the probe is silenced: a non-zero status here just selects "create".
  if gcloud scheduler jobs describe "${name}" \
    --project="${PROJECT}" --location="${REGION}" >/dev/null 2>&1; then
    action=update
    headers_flag=--update-headers # update takes headers via --update-headers
  else
    action=create
    headers_flag=--headers
  fi

  gcloud scheduler jobs "${action}" http "${name}" \
    --project="${PROJECT}" \
    --location="${REGION}" \
    --schedule="${schedule}" \
    --uri="${BRAIN_URL}${path}" \
    --http-method=POST \
    "${headers_flag}=Content-Type=application/json,Authorization=Bearer ${BEARER_TOKEN}" \
    --message-body="${body}" \
    --description="${description}" || return

  echo " ${name} OK"
}

# Job 1: Medical domain crawl (daily 2AM, 100 pages — API caps at 100/request)
upsert_scheduler_job brain-crawl-medical "0 2 * * *" /v1/pipeline/crawl/discover \
  '{"domain_pattern":"*.pubmed.ncbi.nlm.nih.gov/*","crawl_index":"CC-MAIN-2026-08","limit":100,"category":"medical","tags":["pubmed","medical","phase-1"],"inject":true}' \
  "Phase 1: Medical domain crawl (ADR-118)"

# Job 2: Dermatology crawl (daily 3AM, 50 pages)
upsert_scheduler_job brain-crawl-derm "0 3 * * *" /v1/pipeline/crawl/discover \
  '{"domain_pattern":"*.dermnetnz.org/*","crawl_index":"CC-MAIN-2026-08","limit":50,"category":"medical","tags":["dermatology","skin-cancer","phase-1"],"inject":true}' \
  "Phase 1: Dermatology domain crawl (ADR-118)"

# Job 3: Partition cache recompute (hourly, at minute 5)
upsert_scheduler_job brain-partition-cache "5 * * * *" /v1/pipeline/optimize \
  '{"actions":["rebuild_graph"]}' \
  "Hourly partition cache recompute (ADR-118)"
|
|
|
|
#######################################
# Fetch a brain endpoint and pretty-print its JSON response.
# Globals:   BRAIN_URL (read)
# Arguments: $1 - URL path appended to BRAIN_URL
#            $2 - text printed when the fetch or JSON parsing fails
# Outputs:   indented JSON, or the fallback text, on stdout
#######################################
show_json_endpoint() {
  local path=$1 fallback=$2
  curl -s "${BRAIN_URL}${path}" \
    | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps(d, indent=2))" 2>/dev/null \
    || echo "${fallback}"
}

#######################################
# Print a one-line summary of /v1/status. Verification is informational, so an
# unreachable server or an unexpected payload prints a placeholder instead of
# aborting the script under `set -e`.
# Globals:   BRAIN_URL (read)
# Outputs:   summary line, or "(status unavailable)", on stdout
#######################################
show_brain_status() {
  curl -s "${BRAIN_URL}/v1/status" \
    | python3 -c "import sys,json; d=json.load(sys.stdin); print(f'Memories: {d[\"total_memories\"]}, Graph: {d[\"graph_edges\"]} edges, Sparsifier: {d[\"sparsifier_compression\"]:.1f}x')" 2>/dev/null \
    || echo "(status unavailable)"
}

echo ""
echo "=== Deployed Jobs ==="
# Leave gcloud's stderr visible and keep going on failure: the jobs are
# already deployed at this point, and this listing is only a convenience.
gcloud scheduler jobs list --project="${PROJECT}" --location="${REGION}" \
  --filter="name~brain-crawl OR name~brain-partition" \
  || echo "warning: could not list scheduler jobs" >&2

echo ""
echo "=== Verification ==="
echo "Pipeline metrics:"
show_json_endpoint /v1/pipeline/metrics "(no metrics yet)"

echo ""
echo "Crawl stats:"
show_json_endpoint /v1/pipeline/crawl/stats "(no crawl stats yet)"

echo ""
echo "Brain status:"
show_brain_status

echo ""
echo "Phase 1 deployed. Estimated cost: \$11-28/month."
echo "Run 'gcloud scheduler jobs run brain-crawl-medical --project=${PROJECT} --location=${REGION}' to trigger immediately."
|