#!/bin/bash
# Historical Common Crawl evolutionary comparison importer
# Queries the same medical domains across crawl snapshots 2020-2026
# ADR-119 implementation
#
# For every (crawl snapshot, domain) pair this script POSTs to the brain's
# /v1/pipeline/crawl/discover endpoint, prints a one-line result, and finally
# prints the brain's /v1/status summary.
#
# Environment:
#   BRAIN_URL      Base URL of the brain service (default: https://pi.ruv.io)
#   LIMIT          Pages per domain per crawl, non-negative integer (default: 50)
#   BRAIN_API_KEY  Bearer token sent in the Authorization header
#                  (default: the token that was previously hard-coded here)
#
# Requires: bash, curl, python3
set -euo pipefail

BRAIN_URL="${BRAIN_URL:-https://pi.ruv.io}"
# NOTE(review): the fallback token is kept only for backward compatibility;
# a credential committed to source should be rotated and supplied via
# BRAIN_API_KEY instead.
AUTH_HEADER="Authorization: Bearer ${BRAIN_API_KEY:-ruvector-crawl-2026}"
LIMIT="${LIMIT:-50}"  # pages per domain per crawl

# LIMIT is spliced into the JSON request body as a bare number, so anything
# other than a plain non-negative integer would produce malformed JSON.
if ! [[ "${LIMIT}" =~ ^(0|[1-9][0-9]*)$ ]]; then
  printf 'LIMIT must be a non-negative integer, got: %s\n' "${LIMIT}" >&2
  exit 2
fi

# Target domains for medical/dermatology evolution tracking
DOMAINS=(
  "aad.org"
  "dermnetnz.org"
  "skincancer.org"
  "cancer.org"
  "melanoma.org"
)

# Crawl snapshots to compare (2020-2026)
CRAWLS=(
  "CC-MAIN-2020-16"
  "CC-MAIN-2020-50"
  "CC-MAIN-2021-17"
  "CC-MAIN-2021-43"
  "CC-MAIN-2022-05"
  "CC-MAIN-2022-33"
  "CC-MAIN-2023-06"
  "CC-MAIN-2023-40"
  "CC-MAIN-2024-10"
  "CC-MAIN-2024-42"
  "CC-MAIN-2025-13"
  "CC-MAIN-2025-40"
  "CC-MAIN-2026-06"
  "CC-MAIN-2026-08"
)

# Running totals, updated by main and reported after the loop.
TOTAL_IMPORTED=0
TOTAL_ERRORS=0

#######################################
# Ask the brain to discover pages for one domain in one crawl snapshot.
# Globals:   BRAIN_URL, AUTH_HEADER, LIMIT (read)
# Arguments: $1 - crawl index id, $2 - bare domain name
# Outputs:   the response body on stdout; if curl itself fails, a synthetic
#            {"error": ...} JSON object instead
# Returns:   0 always (failures are encoded in the output)
#######################################
discover_pages() {
  local crawl=$1
  local domain=$2
  local payload response
  local rc=0

  payload=$(printf \
    '{"domain_pattern": "*.%s/*", "crawl_index": "%s", "limit": %s, "filters": {"language": "en", "min_length": 500}}' \
    "${domain}" "${crawl}" "${LIMIT}")

  response=$(curl -s -X POST "${BRAIN_URL}/v1/pipeline/crawl/discover" \
    -H "Content-Type: application/json" \
    -H "${AUTH_HEADER}" \
    -d "${payload}" \
    --max-time 60 2>/dev/null) || rc=$?

  case "${rc}" in
    0) printf '%s\n' "${response}" ;;
    # 28 is curl's "operation timed out" exit code; every other failure
    # (DNS, connection refused, TLS, ...) is reported with its own code
    # rather than being mislabelled as a timeout.
    28) printf '%s\n' '{"error":"timeout"}' ;;
    *) printf '{"error":"curl exit %d"}\n' "${rc}" ;;
  esac
}

#######################################
# Turn a discover response (JSON on stdin) into a one-line summary.
# Outputs:   "<count> pages", "ERROR: <message>", or "parse error" on stdout;
#            "?" if python3 itself could not run
# Returns:   0 always
#######################################
summarize_response() {
  python3 -c '
import json
import sys

try:
    d = json.load(sys.stdin)
    if "error" in d:
        print("ERROR: {}".format(d["error"]))
    else:
        count = d.get("discovered", d.get("total", d.get("returned", 0)))
        print("{} pages".format(count))
except Exception:
    print("parse error")
' 2>/dev/null || echo "?"
}

#######################################
# Print the brain status summary.
# Globals:   BRAIN_URL (read)
# Outputs:   four summary lines on stdout; a one-line diagnostic on stderr
#            if the status could not be fetched or is missing a field
# Returns:   non-zero if the status could not be fetched or parsed
#######################################
print_brain_state() {
  # --max-time keeps a hung status endpoint from blocking the script forever.
  curl -s --max-time 60 "${BRAIN_URL}/v1/status" | python3 -c '
import json
import sys

try:
    d = json.load(sys.stdin)
    lines = [
        "Memories: {}".format(d["total_memories"]),
        "Graph: {} edges".format(d["graph_edges"]),
        "Sparsifier: {:.1f}x".format(d["sparsifier_compression"]),
        "Clusters: {}".format(d["cluster_count"]),
    ]
except Exception as exc:
    print("brain status unavailable: {!r}".format(exc), file=sys.stderr)
    sys.exit(1)
print("\n".join(lines))
'
}

main() {
  local crawl domain response summary

  echo "=== Historical Common Crawl Evolutionary Import ==="
  echo "Brain: ${BRAIN_URL}"
  echo "Domains: ${#DOMAINS[@]}"
  echo "Crawls: ${#CRAWLS[@]} quarterly snapshots (2020-2026)"
  echo "Limit: ${LIMIT} pages/domain/crawl"
  echo ""

  for crawl in "${CRAWLS[@]}"; do
    echo "--- Crawl: ${crawl} ---"

    for domain in "${DOMAINS[@]}"; do
      printf '  %s: ' "${domain}"

      # Call the brain's crawl discover endpoint
      response=$(discover_pages "${crawl}" "${domain}")

      # Parse result
      summary=$(summarize_response <<<"${response}")
      printf '%s\n' "${summary}"

      # Tally in the main shell: the helpers above run in subshells, so
      # they cannot update the counters themselves.
      if [[ "${summary}" =~ ^([0-9]+)\ pages$ ]]; then
        # 10# forces base 10 so a value such as "08" is not read as octal.
        TOTAL_IMPORTED=$(( TOTAL_IMPORTED + 10#${BASH_REMATCH[1]} ))
      else
        TOTAL_ERRORS=$(( TOTAL_ERRORS + 1 ))
      fi

      # Rate limit: 2 seconds between requests
      sleep 2
    done
    echo ""
  done

  echo "=== Import Complete ==="
  echo "Pages discovered: ${TOTAL_IMPORTED}"
  echo "Failed requests: ${TOTAL_ERRORS}"
  echo ""

  # Check final brain state
  echo "=== Brain State After Import ==="
  print_brain_state
}

main "$@"