From 045c4c574aabda188cc0fa5932db81ceecc54eb9 Mon Sep 17 00:00:00 2001
From: rUv
Date: Sun, 22 Mar 2026 00:57:58 +0000
Subject: [PATCH] feat: Cloud Run Job deployment for full 6-year Common Crawl import

- Expanded domain list to 60+ medical + CS domains with categorized tagging
- Cloud Run Job config: 10 parallel tasks, 100 segments per crawl
- Multi-crawl orchestrator for 14 quarterly snapshots (2020-2026)
- Enhanced generateTags with domain-specific labels for oncology,
  dermatology, ML conferences, research labs, and academic institutions
- Target: 375K-500K medical/CS pages over 5 months

Co-Authored-By: claude-flow
---
Review notes (this section is not part of the commit message):
* deploy-wet-job.sh: the task's sed line number is now CLOUD_RUN_TASK_INDEX + 1
  (the index is 0-based, sed addresses are 1-based).
* deploy-wet-job.sh: --args uses the ^@^ delimiter so the commas inside the
  domain list are not split into separate arguments; --task-count is replaced
  by --tasks; create-or-update is done with "gcloud run jobs deploy".
* deploy-wet-job.sh: the task image now ships gsutil (node:20-alpine does not),
  the bearer token is read from BRAIN_AUTH_TOKEN instead of being committed,
  and each job gets its own paths object so async crawls cannot overwrite it.
* Indentation was reconstructed (2 spaces) and the blob hashes on the "index"
  lines are carried over from the original patch.

 scripts/deploy-wet-job.sh    | 76 ++++++++++++++++++++++++++++++++++
 scripts/wet-filter-inject.js | 77 +++++++++++++++++++++++++++++++++--
 scripts/wet-full-import.sh   | 65 ++++++++++++++++++++++++++++++
 3 files changed, 214 insertions(+), 4 deletions(-)
 create mode 100755 scripts/deploy-wet-job.sh
 create mode 100755 scripts/wet-full-import.sh

diff --git a/scripts/deploy-wet-job.sh b/scripts/deploy-wet-job.sh
new file mode 100755
index 00000000..c23ad81c
--- /dev/null
+++ b/scripts/deploy-wet-job.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Deploy WET processor as Cloud Run Job for large-scale Common Crawl import
+# Usage: BRAIN_AUTH_TOKEN=<token> ./deploy-wet-job.sh [PROJECT] [CRAWL_INDEX] [START_SEGMENT] [NUM_SEGMENTS]
+set -euo pipefail
+
+PROJECT="${1:-ruv-dev}"
+CRAWL_INDEX="${2:-CC-MAIN-2026-08}"
+START_SEG="${3:-0}"
+NUM_SEGS="${4:-100}"
+REGION="us-central1"
+BUCKET="gs://ruvector-brain-dev/scripts"
+# The task image must ship gsutil; Node.js is installed at task start-up.
+IMAGE="gcr.io/google.com/cloudsdktool/google-cloud-cli:alpine"
+# The brain API token comes from the environment so it is never committed.
+AUTH_TOKEN="${BRAIN_AUTH_TOKEN:?set BRAIN_AUTH_TOKEN to the brain API bearer token}"
+JOB_NAME="wet-import-$(echo "$CRAWL_INDEX" | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"
+# One paths object per job so concurrently running crawls cannot overwrite it.
+PATHS_OBJECT="$BUCKET/wet-paths-$JOB_NAME.txt"
+
+echo "=== WET Cloud Run Job Deployment ==="
+echo "Project: $PROJECT"
+echo "Crawl: $CRAWL_INDEX"
+echo "Segments: $START_SEG to $((START_SEG + NUM_SEGS - 1))"
+echo "Job name: $JOB_NAME"
+echo ""
+
+# First, upload the filter script to GCS so the job can access it
+echo "--- Uploading filter script to GCS ---"
+gsutil cp scripts/wet-filter-inject.js "$BUCKET/wet-filter-inject.js" 2>&1
+
+# Get the WET paths file
+echo "--- Fetching WET paths ---"
+PATHS_URL="https://data.commoncrawl.org/crawl-data/${CRAWL_INDEX}/wet.paths.gz"
+curl -fsSL "$PATHS_URL" | gunzip | sed -n "$((START_SEG + 1)),$((START_SEG + NUM_SEGS))p" > /tmp/wet-paths-batch.txt
+ACTUAL_SEGS=$(wc -l < /tmp/wet-paths-batch.txt | tr -d ' ')
+echo "Segments to process: $ACTUAL_SEGS"
+if [ "$ACTUAL_SEGS" -eq 0 ]; then
+  echo "No WET paths found for $CRAWL_INDEX starting at segment $START_SEG" >&2
+  exit 1
+fi
+
+# Upload paths file
+gsutil cp /tmp/wet-paths-batch.txt "$PATHS_OBJECT" 2>&1
+
+# Build the domain list for the job command
+DOMAIN_LIST="pubmed.ncbi.nlm.nih.gov,ncbi.nlm.nih.gov,who.int,cancer.org,aad.org,dermnetnz.org,melanoma.org,arxiv.org,acm.org,ieee.org,nature.com,nejm.org,bmj.com,mayoclinic.org,clevelandclinic.org,medlineplus.gov,cdc.gov,nih.gov,thelancet.com,sciencedirect.com,webmd.com,healthline.com,medscape.com,jamanetwork.com,frontiersin.org,plos.org,biomedcentral.com,cell.com,springer.com,cochrane.org,clinicaltrials.gov,fda.gov,mskcc.org,mdanderson.org,nccn.org,dl.acm.org,ieeexplore.ieee.org,proceedings.neurips.cc,huggingface.co,pytorch.org,tensorflow.org,cs.stanford.edu,deepmind.google,research.google,microsoft.com/research,openreview.net,paperswithcode.com,asco.org,esmo.org,dana-farber.org,cancer.net,uptodate.com,wiley.com,elsevier.com,mdpi.com,aaai.org,usenix.org,jmlr.org,aclanthology.org"
+
+# Shell command run by every task. CLOUD_RUN_TASK_INDEX is 0-based while sed
+# line numbers are 1-based, so the index is shifted by one.
+TASK_SCRIPT="apk add --no-cache nodejs curl > /dev/null 2>&1 && gsutil cp $BUCKET/wet-filter-inject.js /tmp/filter.js && WET_PATH=\$(gsutil cat $PATHS_OBJECT | sed -n \"\$((\${CLOUD_RUN_TASK_INDEX:-0} + 1))p\") && echo \"Processing: \$WET_PATH\" && curl -sL \"https://data.commoncrawl.org/\$WET_PATH\" | gunzip | node /tmp/filter.js --brain-url https://pi.ruv.io --auth 'Authorization: Bearer $AUTH_TOKEN' --batch-size 10 --crawl-index $CRAWL_INDEX --domains '$DOMAIN_LIST'"
+
+# Create or update the Cloud Run Job. The ^@^ prefix changes the --args list
+# delimiter from "," to "@" (see "gcloud topic escaping") so the commas in the
+# domain list are not split into separate arguments.
+echo "--- Creating Cloud Run Job ---"
+gcloud run jobs deploy "$JOB_NAME" \
+  --project="$PROJECT" \
+  --region="$REGION" \
+  --image="$IMAGE" \
+  --command="/bin/sh" \
+  --args="^@^-c@$TASK_SCRIPT" \
+  --tasks="$ACTUAL_SEGS" \
+  --parallelism=10 \
+  --max-retries=1 \
+  --cpu=1 \
+  --memory=1Gi \
+  --task-timeout=3600s \
+  --set-env-vars="CRAWL_INDEX=$CRAWL_INDEX" \
+  2>&1
+
+echo ""
+echo "--- Job created. To execute: ---"
+echo "gcloud run jobs execute $JOB_NAME --project=$PROJECT --region=$REGION"
+echo ""
+echo "--- To monitor: ---"
+echo "gcloud run jobs executions list --job=$JOB_NAME --project=$PROJECT --region=$REGION"
diff --git a/scripts/wet-filter-inject.js b/scripts/wet-filter-inject.js
index 329509d9..9a2bd44c 100755
--- a/scripts/wet-filter-inject.js
+++ b/scripts/wet-filter-inject.js
@@ -20,8 +20,53 @@ const MAX_CONTENT_LENGTH = 8000;
 const stats = { total: 0, filtered: 0, injected: 0, errors: 0, batched: 0 };
 let batch = [];
 
+// Default domain list: 60+ medical + CS domains
+const DEFAULT_DOMAINS = [
+  // Medical - Major Publishers & Journals
+  'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov', 'who.int',
+  'nature.com', 'nejm.org', 'bmj.com', 'thelancet.com',
+  'jamanetwork.com', 'annals.org', 'sciencedirect.com',
+  // Medical - Clinical Resources
+  'mayoclinic.org', 'clevelandclinic.org', 'medlineplus.gov',
+  'cdc.gov', 'nih.gov', 'webmd.com', 'healthline.com',
+  'medscape.com', 'uptodate.com',
+  // Medical - Oncology & Dermatology
+  'cancer.org', 'aad.org', 'dermnetnz.org', 'melanoma.org',
+  'asco.org', 'esmo.org', 'nccn.org', 'cancer.net',
+  'mskcc.org', 'mdanderson.org', 'dana-farber.org',
+  'dermcoll.edu.au', 'bad.org.uk', 'euroderm.org',
+  'jaad.org', 'jidonline.org',
+  // Medical - Publishers & Open Access
+  'wiley.com', 'onlinelibrary.wiley.com', 'springer.com',
+  'karger.com', 'thieme.com', 'mdpi.com', 'frontiersin.org',
+  'plos.org', 'biomedcentral.com', 'cell.com', 'elsevier.com',
+  // Medical - Regulatory & Evidence
+  'clinicaltrials.gov', 'fda.gov', 'ema.europa.eu',
+  'nice.org.uk', 'cochrane.org',
+  'hopkinsmedicine.org', 'stanfordmedicine.org',
+  // CS - Conferences & Journals
+  'arxiv.org', 'acm.org', 'dl.acm.org', 'ieee.org',
+  'ieeexplore.ieee.org', 'proceedings.neurips.cc',
+  'aclanthology.org', 'jmlr.org', 'aaai.org', 'ijcai.org',
+  'usenix.org', 'vldb.org', 'sigmod.org', 'icml.cc',
+  'cvpr.thecvf.com', 'eccv.ecva.net', 'iccv.thecvf.com',
+  'openreview.net', 'paperswithcode.com',
+  // CS - Frameworks & Tools
+  'huggingface.co', 'pytorch.org', 'tensorflow.org',
+  'wandb.ai', 'mlflow.org', 'ray.io',
+  'dmlc.cs.washington.edu',
+  // CS - Research Labs & Universities
+  'cs.stanford.edu', 'cs.berkeley.edu', 'cs.cmu.edu',
+  'cs.mit.edu', 'deepmind.google', 'ai.meta.com',
+  'research.google', 'microsoft.com/research',
+  'blog.openai.com', 'anthropic.com',
+];
+
+// NOTE(review): matching is by substring, so 'cs.mit.edu' also accepts
+// physics.mit.edu and 'ray.io' accepts array.io; confirm this is intended.
 function matchesDomain(url) {
-  return DOMAINS.some(d => url.includes(d));
+  const allDomains = DOMAINS.length > 0 ? DOMAINS : DEFAULT_DOMAINS;
+  return allDomains.some(d => url.includes(d));
 }
 
 function extractTitle(content) {
@@ -38,12 +83,36 @@ function generateTags(url, content) {
   if (url.includes('pubmed') || url.includes('ncbi')) tags.push('pubmed', 'medical');
   else if (url.includes('arxiv')) tags.push('arxiv', 'research');
   else if (url.includes('who.int')) tags.push('who', 'global-health');
-  else if (url.includes('cancer.org')) tags.push('cancer', 'oncology');
-  else if (url.includes('dermnetnz') || url.includes('aad.org')) tags.push('dermatology');
+  else if (url.includes('cancer.org') || url.includes('cancer.net') || url.includes('nccn.org')) tags.push('cancer', 'oncology');
+  else if (url.includes('asco.org') || url.includes('esmo.org')) tags.push('oncology', 'clinical');
+  else if (url.includes('mskcc.org') || url.includes('mdanderson.org') || url.includes('dana-farber.org')) tags.push('oncology', 'research');
+  else if (url.includes('dermnetnz') || url.includes('aad.org') || url.includes('jaad.org')) tags.push('dermatology');
+  else if (url.includes('dermcoll') || url.includes('bad.org.uk') || url.includes('euroderm')) tags.push('dermatology');
+  else if (url.includes('jidonline')) tags.push('dermatology', 'research');
   else if (url.includes('melanoma')) tags.push('melanoma', 'skin-cancer');
-  else if (url.includes('acm.org') || url.includes('ieee')) tags.push('computer-science');
+  else if (url.includes('clinicaltrials.gov')) tags.push('clinical-trials', 'medical');
+  else if (url.includes('fda.gov') || url.includes('ema.europa.eu')) tags.push('regulatory', 'medical');
+  else if (url.includes('nice.org.uk') || url.includes('cochrane.org')) tags.push('evidence-based', 'medical');
+  else if (url.includes('hopkinsmedicine') || url.includes('stanfordmedicine')) tags.push('medical', 'academic');
+  else if (url.includes('webmd') || url.includes('healthline') || url.includes('medscape')) tags.push('medical', 'clinical');
+  else if (url.includes('uptodate.com')) tags.push('medical', 'clinical-decision');
+  else if (url.includes('acm.org') || url.includes('ieee') || url.includes('dl.acm.org')) tags.push('computer-science');
+  else if (url.includes('neurips') || url.includes('icml') || url.includes('aaai.org')) tags.push('ml', 'conference');
+  else if (url.includes('cvpr') || url.includes('eccv') || url.includes('iccv')) tags.push('computer-vision', 'conference');
+  else if (url.includes('aclanthology')) tags.push('nlp', 'conference');
+  else if (url.includes('usenix') || url.includes('vldb') || url.includes('sigmod')) tags.push('systems', 'conference');
+  else if (url.includes('huggingface') || url.includes('pytorch') || url.includes('tensorflow')) tags.push('ml', 'framework');
+  else if (url.includes('deepmind') || url.includes('ai.meta') || url.includes('research.google')) tags.push('ml', 'research-lab');
+  else if (url.includes('openai') || url.includes('anthropic')) tags.push('ml', 'research-lab');
+  else if (url.includes('cs.stanford') || url.includes('cs.berkeley') || url.includes('cs.cmu') || url.includes('cs.mit')) tags.push('computer-science', 'academic');
+  else if (url.includes('openreview') || url.includes('paperswithcode')) tags.push('ml', 'research');
   else if (url.includes('github') || url.includes('stackoverflow')) tags.push('programming');
   else if (url.includes('nature.com') || url.includes('nejm') || url.includes('lancet')) tags.push('journal', 'research');
+  else if (url.includes('jamanetwork') || url.includes('annals.org') || url.includes('bmj.com')) tags.push('journal', 'medical');
+  else if (url.includes('frontiersin') || url.includes('plos.org') || url.includes('biomedcentral')) tags.push('open-access', 'research');
+  else if (url.includes('cell.com') || url.includes('elsevier') || url.includes('springer') || url.includes('wiley')) tags.push('journal', 'publisher');
+  else if (url.includes('mdpi.com') || url.includes('karger') || url.includes('thieme')) tags.push('journal', 'publisher');
+  else if (url.includes('jmlr.org') || url.includes('ijcai.org')) tags.push('ml', 'journal');
 
   const lower = content.toLowerCase();
   if (lower.includes('melanoma')) tags.push('melanoma');
diff --git a/scripts/wet-full-import.sh b/scripts/wet-full-import.sh
new file mode 100755
index 00000000..ece9a9eb
--- /dev/null
+++ b/scripts/wet-full-import.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Full 6-year medical + CS import via WET processing
+# Processes quarterly Common Crawl snapshots from 2020-2026
+set -euo pipefail
+
+PROJECT="${1:-ruv-dev}"
+SEGS_PER_CRAWL="${2:-100}" # segments per crawl to process
+
+# Fail before any deployment if the brain API token is missing; it is
+# inherited by scripts/deploy-wet-job.sh through the environment.
+: "${BRAIN_AUTH_TOKEN:?set BRAIN_AUTH_TOKEN to the brain API bearer token}"
+export BRAIN_AUTH_TOKEN
+
+# Quarterly crawl indices (2020-2026)
+CRAWLS=(
+  "CC-MAIN-2020-16"
+  "CC-MAIN-2020-50"
+  "CC-MAIN-2021-17"
+  "CC-MAIN-2021-43"
+  "CC-MAIN-2022-05"
+  "CC-MAIN-2022-33"
+  "CC-MAIN-2023-06"
+  "CC-MAIN-2023-40"
+  "CC-MAIN-2024-10"
+  "CC-MAIN-2024-42"
+  "CC-MAIN-2025-13"
+  "CC-MAIN-2025-40"
+  "CC-MAIN-2026-06"
+  "CC-MAIN-2026-08"
+)
+
+BRAIN_URL="https://pi.ruv.io"
+
+echo "=== Full 6-Year Medical + CS Import ==="
+echo "Crawls: ${#CRAWLS[@]}"
+echo "Segments per crawl: $SEGS_PER_CRAWL"
+echo "Total segments: $((${#CRAWLS[@]} * SEGS_PER_CRAWL))"
+echo ""
+
+BEFORE=$(curl -s "$BRAIN_URL/v1/status" \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['total_memories'])" 2>/dev/null || echo "0")
+echo "Brain memories before: $BEFORE"
+echo ""
+
+for crawl in "${CRAWLS[@]}"; do
+  echo "=== Deploying job for $crawl ==="
+  bash scripts/deploy-wet-job.sh "$PROJECT" "$crawl" 0 "$SEGS_PER_CRAWL"
+
+  # Execute the job (the name must match the one derived in deploy-wet-job.sh)
+  JOB_NAME="wet-import-$(echo "$crawl" | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"
+  gcloud run jobs execute "$JOB_NAME" --project="$PROJECT" --region=us-central1 --async 2>&1
+
+  echo "Job $JOB_NAME submitted (async)"
+  echo ""
+
+  # Don't flood -- wait 30s between job submissions
+  sleep 30
+done
+
+echo ""
+echo "=== All jobs submitted ==="
+echo "Monitor with: gcloud run jobs executions list --project=$PROJECT --region=us-central1"
+echo ""
+echo "Check brain growth:"
+echo " curl -s $BRAIN_URL/v1/status | python3 -c \"import sys,json; d=json.load(sys.stdin); print(f'Memories: {d[\\\"total_memories\\\"]}')\""