feat: Cloud Run Job deployment for full 6-year Common Crawl import

- Expanded domain list to 60+ medical + CS domains with categorized tagging
- Cloud Run Job config: 10 parallel tasks, 100 segments per crawl
- Multi-crawl orchestrator for 14 Common Crawl snapshots, two per year (2020-2026)
- Enhanced generateTags with domain-specific labels for oncology, dermatology,
  ML conferences, research labs, and academic institutions
- Target: 375K-500K medical/CS pages over 5 months

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
rUv 2026-03-22 00:57:58 +00:00
parent 14ab7b0bdc
commit 045c4c574a
3 changed files with 196 additions and 4 deletions

65
scripts/deploy-wet-job.sh Executable file
View file

@ -0,0 +1,65 @@
#!/bin/bash
# Deploy the WET processor as a Cloud Run Job for large-scale Common Crawl import.
#
# Usage: ./deploy-wet-job.sh [PROJECT] [CRAWL_INDEX] [START_SEGMENT] [NUM_SEGMENTS]
#   PROJECT        GCP project id                          (default: ruv-dev)
#   CRAWL_INDEX    Common Crawl snapshot id                (default: CC-MAIN-2026-08)
#   START_SEGMENT  0-based offset into the wet.paths list  (default: 0)
#   NUM_SEGMENTS   number of WET files, one per job task   (default: 100)
#
# Run from the repository root: the filter script is read from
# scripts/wet-filter-inject.js relative to the current directory.
set -euo pipefail

# Print an error to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

PROJECT="${1:-ruv-dev}"
CRAWL_INDEX="${2:-CC-MAIN-2026-08}"
START_SEG="${3:-0}"
NUM_SEGS="${4:-100}"
REGION="us-central1"
GCS_PREFIX="gs://ruvector-brain-dev/scripts"
FILTER_SRC="scripts/wet-filter-inject.js"
# Delimiter for the gcloud --args list; must not occur in the task command.
ARGS_DELIM="@"

# CRAWL_INDEX is interpolated into the task's shell command, so restrict it
# to characters that cannot break out of that command.
crawl_re='^[A-Za-z0-9-]+$'
start_re='^(0|[1-9][0-9]*)$'
count_re='^[1-9][0-9]*$'
[[ "$CRAWL_INDEX" =~ $crawl_re ]] || die "invalid CRAWL_INDEX: $CRAWL_INDEX"
[[ "$START_SEG" =~ $start_re ]] || die "START_SEGMENT must be a non-negative integer: $START_SEG"
[[ "$NUM_SEGS" =~ $count_re ]] || die "NUM_SEGMENTS must be a positive integer: $NUM_SEGS"
[[ -f "$FILTER_SRC" ]] || die "$FILTER_SRC not found (run from the repository root)"

# NB: `tail -c 8` counts the trailing newline, so the suffix is the last seven
# characters (CC-MAIN-2026-08 -> "n202608"). Left unchanged on purpose:
# scripts/wet-full-import.sh recomputes the same name to execute the job.
JOB_NAME="wet-import-$(printf '%s\n' "$CRAWL_INDEX" | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"

FILTER_OBJECT="${GCS_PREFIX}/wet-filter-inject.js"
# One paths object per job. Tasks read it at run time, so a shared object
# would be overwritten by the next crawl's deployment while earlier jobs
# are still running.
PATHS_OBJECT="${GCS_PREFIX}/wet-paths-${JOB_NAME}.txt"

echo "=== WET Cloud Run Job Deployment ==="
echo "Project: $PROJECT"
echo "Crawl: $CRAWL_INDEX"
echo "Segments: $START_SEG to $((START_SEG + NUM_SEGS - 1))"
echo "Job name: $JOB_NAME"
echo ""

# First, upload the filter script to GCS so the job can access it
echo "--- Uploading filter script to GCS ---"
gsutil cp "$FILTER_SRC" "$FILTER_OBJECT" 2>&1

# Get the WET paths file and keep only the requested window of lines.
echo "--- Fetching WET paths ---"
PATHS_FILE="$(mktemp)"
trap 'rm -f -- "$PATHS_FILE"' EXIT
PATHS_URL="https://data.commoncrawl.org/crawl-data/${CRAWL_INDEX}/wet.paths.gz"
# -f makes curl fail on HTTP errors instead of piping an error page to gunzip.
curl -fsSL "$PATHS_URL" \
  | gunzip \
  | sed -n "$((START_SEG + 1)),$((START_SEG + NUM_SEGS))p" > "$PATHS_FILE" \
  || die "could not fetch WET paths from $PATHS_URL"
# Arithmetic expansion strips the padding some wc implementations emit.
ACTUAL_SEGS=$(( $(wc -l < "$PATHS_FILE") ))
echo "Segments to process: $ACTUAL_SEGS"
(( ACTUAL_SEGS > 0 )) || die "no WET paths in range for $CRAWL_INDEX (start=$START_SEG, count=$NUM_SEGS)"

# Upload paths file
gsutil cp "$PATHS_FILE" "$PATHS_OBJECT" 2>&1

# Build the domain list for the job command
DOMAIN_LIST="pubmed.ncbi.nlm.nih.gov,ncbi.nlm.nih.gov,who.int,cancer.org,aad.org,dermnetnz.org,melanoma.org,arxiv.org,acm.org,ieee.org,nature.com,nejm.org,bmj.com,mayoclinic.org,clevelandclinic.org,medlineplus.gov,cdc.gov,nih.gov,thelancet.com,sciencedirect.com,webmd.com,healthline.com,medscape.com,jamanetwork.com,frontiersin.org,plos.org,biomedcentral.com,cell.com,springer.com,cochrane.org,clinicaltrials.gov,fda.gov,mskcc.org,mdanderson.org,nccn.org,dl.acm.org,ieeexplore.ieee.org,proceedings.neurips.cc,huggingface.co,pytorch.org,tensorflow.org,cs.stanford.edu,deepmind.google,research.google,microsoft.com/research,openreview.net,paperswithcode.com,asco.org,esmo.org,dana-farber.org,cancer.net,uptodate.com,wiley.com,elsevier.com,mdpi.com,plos.org,aaai.org,usenix.org,jmlr.org,aclanthology.org"

# Shell command each task runs inside the container.
# NOTE(review): node:20-alpine does not appear to ship gsutil, and the apk
# line does not install it -- confirm the tasks can run these gsutil calls,
# or switch to an image that bundles the Cloud SDK.
# NOTE(review): the bearer token below is hard-coded; consider injecting it
# from Secret Manager instead.
TASK_SCRIPT="apk add --no-cache curl bash > /dev/null 2>&1"
TASK_SCRIPT+=" && gsutil cp ${FILTER_OBJECT} /tmp/filter.js 2>/dev/null"
# CLOUD_RUN_TASK_INDEX is 0-based while sed addresses are 1-based, hence +1.
TASK_SCRIPT+=" && WET_PATH=\$(gsutil cat ${PATHS_OBJECT} 2>/dev/null | sed -n \"\$((\${CLOUD_RUN_TASK_INDEX:-0} + 1))p\")"
TASK_SCRIPT+=" && [ -n \"\$WET_PATH\" ]"
TASK_SCRIPT+=" && echo \"Processing: \$WET_PATH\""
TASK_SCRIPT+=" && curl -sL \"https://data.commoncrawl.org/\$WET_PATH\" | gunzip | node /tmp/filter.js --brain-url https://pi.ruv.io --auth 'Authorization: Bearer ruvector-crawl-2026' --batch-size 10 --crawl-index ${CRAWL_INDEX} --domains '${DOMAIN_LIST}'"

[[ "$TASK_SCRIPT" != *"$ARGS_DELIM"* ]] \
  || die "task command contains the --args delimiter '$ARGS_DELIM'"

# --args is a list flag that splits on commas by default, which would cut the
# command apart at the commas in DOMAIN_LIST. The ^DELIM^ prefix selects an
# alternate delimiter (see `gcloud topic escaping`).
JOB_FLAGS=(
  --project="$PROJECT"
  --region="$REGION"
  --image="node:20-alpine"
  --command="/bin/sh"
  --args="^${ARGS_DELIM}^-c${ARGS_DELIM}${TASK_SCRIPT}"
  --tasks="$ACTUAL_SEGS"
  --parallelism=10
  --max-retries=1
  --cpu=1
  --memory=1Gi
  --task-timeout=3600s
  --set-env-vars="CRAWL_INDEX=$CRAWL_INDEX"
)

# Create/update the Cloud Run Job. The same flags are used for both verbs so
# an existing job also picks up the current command and paths object.
echo "--- Creating Cloud Run Job ---"
if gcloud run jobs describe "$JOB_NAME" --project="$PROJECT" --region="$REGION" > /dev/null 2>&1; then
  JOB_VERB="update"
else
  JOB_VERB="create"
fi
gcloud run jobs "$JOB_VERB" "$JOB_NAME" "${JOB_FLAGS[@]}" 2>&1

echo ""
echo "--- Job created. To execute: ---"
echo "gcloud run jobs execute $JOB_NAME --project=$PROJECT --region=$REGION"
echo ""
echo "--- To monitor: ---"
echo "gcloud run jobs executions list --job=$JOB_NAME --project=$PROJECT --region=$REGION"

View file

@ -20,8 +20,51 @@ const MAX_CONTENT_LENGTH = 8000;
const stats = { total: 0, filtered: 0, injected: 0, errors: 0, batched: 0 };
let batch = [];
// Built-in allow-list of medical and computer-science sites, grouped by
// category below. matchesDomain falls back to this list when DOMAINS is empty.
// Note that one entry ('microsoft.com/research') carries a path, not just a
// host name.
const DEFAULT_DOMAINS = [
// Medical - Major Publishers & Journals
'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov', 'who.int',
'nature.com', 'nejm.org', 'bmj.com', 'thelancet.com',
'jamanetwork.com', 'annals.org', 'sciencedirect.com',
// Medical - Clinical Resources
'mayoclinic.org', 'clevelandclinic.org', 'medlineplus.gov',
'cdc.gov', 'nih.gov', 'webmd.com', 'healthline.com',
'medscape.com', 'uptodate.com',
// Medical - Oncology & Dermatology
'cancer.org', 'aad.org', 'dermnetnz.org', 'melanoma.org',
'asco.org', 'esmo.org', 'nccn.org', 'cancer.net',
'mskcc.org', 'mdanderson.org', 'dana-farber.org',
'dermcoll.edu.au', 'bad.org.uk', 'euroderm.org',
'jaad.org', 'jidonline.org',
// Medical - Publishers & Open Access
'wiley.com', 'onlinelibrary.wiley.com', 'springer.com',
'karger.com', 'thieme.com', 'mdpi.com', 'frontiersin.org',
'plos.org', 'biomedcentral.com', 'cell.com', 'elsevier.com',
// Medical - Regulatory & Evidence
'clinicaltrials.gov', 'fda.gov', 'ema.europa.eu',
'nice.org.uk', 'cochrane.org',
// Medical - Academic medical centers
'hopkinsmedicine.org', 'stanfordmedicine.org',
// CS - Conferences & Journals
'arxiv.org', 'acm.org', 'dl.acm.org', 'ieee.org',
'ieeexplore.ieee.org', 'proceedings.neurips.cc',
'aclanthology.org', 'jmlr.org', 'aaai.org', 'ijcai.org',
'usenix.org', 'vldb.org', 'sigmod.org', 'icml.cc',
'cvpr.thecvf.com', 'eccv.ecva.net', 'iccv.thecvf.com',
'openreview.net', 'paperswithcode.com',
// CS - Frameworks & Tools
'huggingface.co', 'pytorch.org', 'tensorflow.org',
'wandb.ai', 'mlflow.org', 'ray.io',
'dmlc.cs.washington.edu',
// CS - Research Labs & Universities
'cs.stanford.edu', 'cs.berkeley.edu', 'cs.cmu.edu',
'cs.mit.edu', 'deepmind.google', 'ai.meta.com',
'research.google', 'microsoft.com/research',
'blog.openai.com', 'anthropic.com',
];
/**
 * Report whether a URL belongs to one of the domains selected for import.
 *
 * The candidate list is DOMAINS when it is non-empty, otherwise
 * DEFAULT_DOMAINS.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} true when any candidate occurs as a substring of `url`.
 */
function matchesDomain(url) {
  // The earlier unconditional `return DOMAINS.some(...)` is removed: it made
  // the DEFAULT_DOMAINS fallback below unreachable.
  const allDomains = DOMAINS.length > 0 ? DOMAINS : DEFAULT_DOMAINS;
  // NOTE(review): this is a plain substring test, so an entry such as
  // 'nature.com' also matches 'signature.com' -- consider comparing against
  // the parsed host name if that matters for the import.
  return allDomains.some(d => url.includes(d));
}
function extractTitle(content) {
@ -38,12 +81,36 @@ function generateTags(url, content) {
if (url.includes('pubmed') || url.includes('ncbi')) tags.push('pubmed', 'medical');
else if (url.includes('arxiv')) tags.push('arxiv', 'research');
else if (url.includes('who.int')) tags.push('who', 'global-health');
else if (url.includes('cancer.org')) tags.push('cancer', 'oncology');
else if (url.includes('dermnetnz') || url.includes('aad.org')) tags.push('dermatology');
else if (url.includes('cancer.org') || url.includes('cancer.net') || url.includes('nccn.org')) tags.push('cancer', 'oncology');
else if (url.includes('asco.org') || url.includes('esmo.org')) tags.push('oncology', 'clinical');
else if (url.includes('mskcc.org') || url.includes('mdanderson.org') || url.includes('dana-farber.org')) tags.push('oncology', 'research');
else if (url.includes('dermnetnz') || url.includes('aad.org') || url.includes('jaad.org')) tags.push('dermatology');
else if (url.includes('dermcoll') || url.includes('bad.org.uk') || url.includes('euroderm')) tags.push('dermatology');
else if (url.includes('jidonline')) tags.push('dermatology', 'research');
else if (url.includes('melanoma')) tags.push('melanoma', 'skin-cancer');
else if (url.includes('acm.org') || url.includes('ieee')) tags.push('computer-science');
else if (url.includes('clinicaltrials.gov')) tags.push('clinical-trials', 'medical');
else if (url.includes('fda.gov') || url.includes('ema.europa.eu')) tags.push('regulatory', 'medical');
else if (url.includes('nice.org.uk') || url.includes('cochrane.org')) tags.push('evidence-based', 'medical');
else if (url.includes('hopkinsmedicine') || url.includes('stanfordmedicine')) tags.push('medical', 'academic');
else if (url.includes('webmd') || url.includes('healthline') || url.includes('medscape')) tags.push('medical', 'clinical');
else if (url.includes('uptodate.com')) tags.push('medical', 'clinical-decision');
else if (url.includes('acm.org') || url.includes('ieee') || url.includes('dl.acm.org')) tags.push('computer-science');
else if (url.includes('neurips') || url.includes('icml') || url.includes('aaai.org')) tags.push('ml', 'conference');
else if (url.includes('cvpr') || url.includes('eccv') || url.includes('iccv')) tags.push('computer-vision', 'conference');
else if (url.includes('aclanthology')) tags.push('nlp', 'conference');
else if (url.includes('usenix') || url.includes('vldb') || url.includes('sigmod')) tags.push('systems', 'conference');
else if (url.includes('huggingface') || url.includes('pytorch') || url.includes('tensorflow')) tags.push('ml', 'framework');
else if (url.includes('deepmind') || url.includes('ai.meta') || url.includes('research.google')) tags.push('ml', 'research-lab');
else if (url.includes('openai') || url.includes('anthropic')) tags.push('ml', 'research-lab');
else if (url.includes('cs.stanford') || url.includes('cs.berkeley') || url.includes('cs.cmu') || url.includes('cs.mit')) tags.push('computer-science', 'academic');
else if (url.includes('openreview') || url.includes('paperswithcode')) tags.push('ml', 'research');
else if (url.includes('github') || url.includes('stackoverflow')) tags.push('programming');
else if (url.includes('nature.com') || url.includes('nejm') || url.includes('lancet')) tags.push('journal', 'research');
else if (url.includes('jamanetwork') || url.includes('annals.org') || url.includes('bmj.com')) tags.push('journal', 'medical');
else if (url.includes('frontiersin') || url.includes('plos.org') || url.includes('biomedcentral')) tags.push('open-access', 'research');
else if (url.includes('cell.com') || url.includes('elsevier') || url.includes('springer') || url.includes('wiley')) tags.push('journal', 'publisher');
else if (url.includes('mdpi.com') || url.includes('karger') || url.includes('thieme')) tags.push('journal', 'publisher');
else if (url.includes('jmlr.org') || url.includes('ijcai.org')) tags.push('ml', 'journal');
const lower = content.toLowerCase();
if (lower.includes('melanoma')) tags.push('melanoma');

60
scripts/wet-full-import.sh Executable file
View file

@ -0,0 +1,60 @@
#!/bin/bash
# Drives the multi-year medical + CS import over Common Crawl WET data.
# For every snapshot in CRAWLS it deploys a Cloud Run Job through
# scripts/deploy-wet-job.sh and then starts that job asynchronously.
#
# Arguments:
#   $1 - GCP project id (default: ruv-dev)
#   $2 - WET segments to process per snapshot (default: 100)
set -euo pipefail

PROJECT="${1:-ruv-dev}"
SEGS_PER_CRAWL="${2:-100}"
REGION="us-central1"
BRAIN_URL="https://pi.ruv.io"

# Snapshot ids to import, oldest first (two per year, 2020 through 2026).
CRAWLS=(
  "CC-MAIN-2020-16"
  "CC-MAIN-2020-50"
  "CC-MAIN-2021-17"
  "CC-MAIN-2021-43"
  "CC-MAIN-2022-05"
  "CC-MAIN-2022-33"
  "CC-MAIN-2023-06"
  "CC-MAIN-2023-40"
  "CC-MAIN-2024-10"
  "CC-MAIN-2024-42"
  "CC-MAIN-2025-13"
  "CC-MAIN-2025-40"
  "CC-MAIN-2026-06"
  "CC-MAIN-2026-08"
)

# Prints the brain's current total_memories value, or "0" when the status
# endpoint cannot be read or parsed.
brain_memory_count() {
  curl -s "$BRAIN_URL/v1/status" \
    | python3 -c "import sys,json; print(json.load(sys.stdin)['total_memories'])" 2>/dev/null \
    || echo "0"
}

# Deploys the job for one snapshot, launches it without waiting for it to
# finish, then pauses so submissions are spaced out.
# Arguments: $1 - snapshot id, e.g. CC-MAIN-2024-10
submit_snapshot() {
  local snapshot="$1"
  local job

  printf '%s\n' "=== Deploying job for $snapshot ==="
  bash scripts/deploy-wet-job.sh "$PROJECT" "$snapshot" 0 "$SEGS_PER_CRAWL"

  # Same derivation the deploy script uses for the job it creates.
  job="wet-import-$(printf '%s\n' "$snapshot" | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"
  gcloud run jobs execute "$job" --project="$PROJECT" --region="$REGION" --async 2>&1
  printf '%s\n\n' "Job $job submitted (async)"

  # Throttle: 30 seconds between submissions.
  sleep 30
}

crawl_count=${#CRAWLS[@]}
total_segments=$((crawl_count * SEGS_PER_CRAWL))

printf '%s\n' "=== Full 6-Year Medical + CS Import ==="
printf '%s\n' "Crawls: $crawl_count"
printf '%s\n' "Segments per crawl: $SEGS_PER_CRAWL"
printf '%s\n\n' "Total segments: $total_segments"

baseline=$(brain_memory_count)
printf '%s\n\n' "Brain memories before: $baseline"

for ((idx = 0; idx < crawl_count; idx++)); do
  submit_snapshot "${CRAWLS[idx]}"
done

printf '\n'
printf '%s\n' "=== All jobs submitted ==="
printf '%s\n\n' "Monitor with: gcloud run jobs executions list --project=$PROJECT --region=$REGION"
printf '%s\n' "Check brain growth:"
printf '%s\n' " curl -s $BRAIN_URL/v1/status | python3 -c \"import sys,json; d=json.load(sys.stdin); print(f'Memories: {d[\\\"total_memories\\\"]}')\""