mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-25 15:03:46 +00:00
feat: Cloud Run Job deployment for full 6-year Common Crawl import
- Expanded domain list to 60+ medical + CS domains with categorized tagging
- Cloud Run Job config: 10 parallel tasks, 100 segments per crawl
- Multi-crawl orchestrator for 14 quarterly snapshots (2020-2026)
- Enhanced generateTags with domain-specific labels for oncology, dermatology, ML conferences, research labs, and academic institutions
- Target: 375K-500K medical/CS pages over 5 months

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
14ab7b0bdc
commit
045c4c574a
3 changed files with 196 additions and 4 deletions
65
scripts/deploy-wet-job.sh
Executable file
65
scripts/deploy-wet-job.sh
Executable file
|
|
@ -0,0 +1,65 @@
|
|||
#!/bin/bash
# Deploy WET processor as Cloud Run Job for large-scale Common Crawl import
# Usage: ./deploy-wet-job.sh [PROJECT] [CRAWL_INDEX] [START_SEGMENT] [NUM_SEGMENTS]
#
# Positional arguments (all optional):
#   PROJECT        GCP project ID                            (default: ruv-dev)
#   CRAWL_INDEX    Common Crawl snapshot ID                  (default: CC-MAIN-2026-08)
#   START_SEGMENT  0-based offset into the crawl's wet.paths (default: 0)
#   NUM_SEGMENTS   number of WET files; one task per file    (default: 100)
#
# Environment:
#   BRAIN_AUTH_TOKEN  bearer token passed to the filter script's --auth header.
#                     Defaults to the value that used to be hard-coded here.
#                     NOTE(review): the token still ends up in the job's args;
#                     consider a Secret Manager-backed env var instead.
set -euo pipefail

PROJECT="${1:-ruv-dev}"
CRAWL_INDEX="${2:-CC-MAIN-2026-08}"
START_SEG="${3:-0}"
NUM_SEGS="${4:-100}"
REGION="us-central1"
BUCKET="gs://ruvector-brain-dev"
BRAIN_AUTH_TOKEN="${BRAIN_AUTH_TOKEN:-ruvector-crawl-2026}"

# Resolve sibling files relative to this script so it works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Job name derivation is duplicated in wet-full-import.sh and must stay in sync.
# `tail -c 8` counts echo's trailing newline, so 7 characters survive:
# CC-MAIN-2026-08 -> ccmain202608 -> n202608 -> wet-import-n202608
JOB_NAME="wet-import-$(echo "$CRAWL_INDEX" | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"

# One paths object per job: jobs for different crawls may run concurrently,
# so they must not share (and overwrite) a single batch file.
PATHS_OBJECT="$BUCKET/scripts/wet-paths-${JOB_NAME}.txt"
PATHS_FILE="$(mktemp)"
trap 'rm -f "$PATHS_FILE"' EXIT

echo "=== WET Cloud Run Job Deployment ==="
echo "Project: $PROJECT"
echo "Crawl: $CRAWL_INDEX"
echo "Segments: $START_SEG to $((START_SEG + NUM_SEGS - 1))"
echo "Job name: $JOB_NAME"
echo ""

# First, upload the filter script to GCS so the job can access it
echo "--- Uploading filter script to GCS ---"
gsutil cp "$SCRIPT_DIR/wet-filter-inject.js" "$BUCKET/scripts/wet-filter-inject.js" 2>&1

# Get the WET paths file and keep only the requested slice (sed lines are 1-based)
echo "--- Fetching WET paths ---"
PATHS_URL="https://data.commoncrawl.org/crawl-data/${CRAWL_INDEX}/wet.paths.gz"
curl -sfL "$PATHS_URL" | gunzip | sed -n "$((START_SEG + 1)),$((START_SEG + NUM_SEGS))p" > "$PATHS_FILE"
# Strip the padding some wc implementations add around the count.
ACTUAL_SEGS="$(wc -l < "$PATHS_FILE" | tr -d '[:space:]')"
echo "Segments to process: $ACTUAL_SEGS"

if [ "$ACTUAL_SEGS" -eq 0 ]; then
  echo "ERROR: no WET paths found for $CRAWL_INDEX at offset $START_SEG" >&2
  exit 1
fi

# Upload paths file
gsutil cp "$PATHS_FILE" "$PATHS_OBJECT" 2>&1

# Build the domain list for the job command
# NOTE(review): this list is shorter than DEFAULT_DOMAINS in wet-filter-inject.js;
# since it is passed via --domains it presumably overrides that default — confirm
# which list is meant to drive the import.
DOMAIN_LIST="pubmed.ncbi.nlm.nih.gov,ncbi.nlm.nih.gov,who.int,cancer.org,aad.org,dermnetnz.org,melanoma.org,arxiv.org,acm.org,ieee.org,nature.com,nejm.org,bmj.com,mayoclinic.org,clevelandclinic.org,medlineplus.gov,cdc.gov,nih.gov,thelancet.com,sciencedirect.com,webmd.com,healthline.com,medscape.com,jamanetwork.com,frontiersin.org,plos.org,biomedcentral.com,cell.com,springer.com,cochrane.org,clinicaltrials.gov,fda.gov,mskcc.org,mdanderson.org,nccn.org,dl.acm.org,ieeexplore.ieee.org,proceedings.neurips.cc,huggingface.co,pytorch.org,tensorflow.org,cs.stanford.edu,deepmind.google,research.google,microsoft.com/research,openreview.net,paperswithcode.com,asco.org,esmo.org,dana-farber.org,cancer.net,uptodate.com,wiley.com,elsevier.com,mdpi.com,aaai.org,usenix.org,jmlr.org,aclanthology.org"

# Shell command each task runs inside the container. Escaped `\$` expansions are
# evaluated in the container; unescaped ones are filled in now, at deploy time.
# CLOUD_RUN_TASK_INDEX is 0-based while sed line numbers are 1-based, hence "+ 1".
# NOTE(review): only curl and bash are installed below, yet the command calls
# gsutil; the stock node:20-alpine image is not expected to ship it — confirm,
# and if absent use an image that bundles the Cloud SDK or fetch over HTTPS.
TASK_CMD="apk add --no-cache curl bash > /dev/null 2>&1"
TASK_CMD+=" && gsutil cp $BUCKET/scripts/wet-filter-inject.js /tmp/filter.js"
TASK_CMD+=" && WET_PATH=\$(gsutil cat $PATHS_OBJECT | sed -n \"\$((\${CLOUD_RUN_TASK_INDEX:-0} + 1))p\")"
TASK_CMD+=" && [ -n \"\$WET_PATH\" ]"
TASK_CMD+=" && echo \"Processing: \$WET_PATH\""
TASK_CMD+=" && curl -sL \"https://data.commoncrawl.org/\$WET_PATH\" | gunzip"
TASK_CMD+=" | node /tmp/filter.js --brain-url https://pi.ruv.io --auth 'Authorization: Bearer $BRAIN_AUTH_TOKEN' --batch-size 10 --crawl-index $CRAWL_INDEX --domains '$DOMAIN_LIST'"

# --args is a comma-separated list flag and the command contains commas (the
# domain list), so switch gcloud to '@' as the delimiter with the ^@^ prefix.
if [[ "$TASK_CMD" == *"@"* ]]; then
  echo "ERROR: task command contains '@', which is reserved as the --args delimiter" >&2
  exit 1
fi

# Same flags for create and update, so re-deploying an existing job also
# refreshes its command, image and resources rather than only the task count.
JOB_FLAGS=(
  --project="$PROJECT"
  --region="$REGION"
  --image="node:20-alpine"
  --command="/bin/sh"
  --args="^@^-c@${TASK_CMD}"
  --tasks="$ACTUAL_SEGS"
  --parallelism=10
  --max-retries=1
  --cpu=1
  --memory=1Gi
  --task-timeout=3600s
  --set-env-vars="CRAWL_INDEX=$CRAWL_INDEX"
)

# Create/update the Cloud Run Job
echo "--- Creating Cloud Run Job ---"
gcloud run jobs create "$JOB_NAME" "${JOB_FLAGS[@]}" 2>&1 || \
  gcloud run jobs update "$JOB_NAME" "${JOB_FLAGS[@]}" 2>&1

echo ""
echo "--- Job created. To execute: ---"
echo "gcloud run jobs execute $JOB_NAME --project=$PROJECT --region=$REGION"
echo ""
echo "--- To monitor: ---"
echo "gcloud run jobs executions list --job=$JOB_NAME --project=$PROJECT --region=$REGION"
|
||||
|
|
@ -20,8 +20,51 @@ const MAX_CONTENT_LENGTH = 8000;
|
|||
// Counters for the current run; every field starts at zero.
const stats = {
  total: 0,
  filtered: 0,
  injected: 0,
  errors: 0,
  batched: 0,
};

// Pending records buffer; starts empty (declared with `let`, as in the original).
let batch = [];
|
||||
|
||||
// Fallback domain list (medical + computer science), one entry per line.
// Entries are plain strings; one of them ('microsoft.com/research') carries a
// path segment rather than being a bare host name.
const DEFAULT_DOMAINS = [
  // --- Medical: major publishers and journals ---
  'pubmed.ncbi.nlm.nih.gov',
  'ncbi.nlm.nih.gov',
  'who.int',
  'nature.com',
  'nejm.org',
  'bmj.com',
  'thelancet.com',
  'jamanetwork.com',
  'annals.org',
  'sciencedirect.com',

  // --- Medical: clinical resources ---
  'mayoclinic.org',
  'clevelandclinic.org',
  'medlineplus.gov',
  'cdc.gov',
  'nih.gov',
  'webmd.com',
  'healthline.com',
  'medscape.com',
  'uptodate.com',

  // --- Medical: oncology and dermatology ---
  'cancer.org',
  'aad.org',
  'dermnetnz.org',
  'melanoma.org',
  'asco.org',
  'esmo.org',
  'nccn.org',
  'cancer.net',
  'mskcc.org',
  'mdanderson.org',
  'dana-farber.org',
  'dermcoll.edu.au',
  'bad.org.uk',
  'euroderm.org',
  'jaad.org',
  'jidonline.org',

  // --- Medical: publishers and open access ---
  'wiley.com',
  'onlinelibrary.wiley.com',
  'springer.com',
  'karger.com',
  'thieme.com',
  'mdpi.com',
  'frontiersin.org',
  'plos.org',
  'biomedcentral.com',
  'cell.com',
  'elsevier.com',

  // --- Medical: regulatory and evidence bodies ---
  'clinicaltrials.gov',
  'fda.gov',
  'ema.europa.eu',
  'nice.org.uk',
  'cochrane.org',

  // --- Medical: academic medical centers ---
  'hopkinsmedicine.org',
  'stanfordmedicine.org',

  // --- CS: conferences and journals ---
  'arxiv.org',
  'acm.org',
  'dl.acm.org',
  'ieee.org',
  'ieeexplore.ieee.org',
  'proceedings.neurips.cc',
  'aclanthology.org',
  'jmlr.org',
  'aaai.org',
  'ijcai.org',
  'usenix.org',
  'vldb.org',
  'sigmod.org',
  'icml.cc',
  'cvpr.thecvf.com',
  'eccv.ecva.net',
  'iccv.thecvf.com',
  'openreview.net',
  'paperswithcode.com',

  // --- CS: frameworks and tools ---
  'huggingface.co',
  'pytorch.org',
  'tensorflow.org',
  'wandb.ai',
  'mlflow.org',
  'ray.io',
  'dmlc.cs.washington.edu',

  // --- CS: research labs and universities ---
  'cs.stanford.edu',
  'cs.berkeley.edu',
  'cs.cmu.edu',
  'cs.mit.edu',
  'deepmind.google',
  'ai.meta.com',
  'research.google',
  'microsoft.com/research',
  'blog.openai.com',
  'anthropic.com',
];
|
||||
|
||||
/**
 * Report whether a URL mentions one of the domains of interest.
 *
 * Uses the `DOMAINS` list when it is non-empty and falls back to
 * `DEFAULT_DOMAINS` otherwise. The previous early
 * `return DOMAINS.some(...)` made that fallback unreachable and is removed.
 *
 * Matching is a plain substring test against the whole URL string, so an
 * entry also matches when it appears in the path or query rather than the host.
 *
 * @param {string} url - URL string to test.
 * @returns {boolean} `true` if any active domain entry occurs in `url`.
 */
function matchesDomain(url) {
  const allDomains = DOMAINS.length > 0 ? DOMAINS : DEFAULT_DOMAINS;
  return allDomains.some((d) => url.includes(d));
}
|
||||
|
||||
function extractTitle(content) {
|
||||
|
|
@ -38,12 +81,36 @@ function generateTags(url, content) {
|
|||
if (url.includes('pubmed') || url.includes('ncbi')) tags.push('pubmed', 'medical');
|
||||
else if (url.includes('arxiv')) tags.push('arxiv', 'research');
|
||||
else if (url.includes('who.int')) tags.push('who', 'global-health');
|
||||
else if (url.includes('cancer.org')) tags.push('cancer', 'oncology');
|
||||
else if (url.includes('dermnetnz') || url.includes('aad.org')) tags.push('dermatology');
|
||||
else if (url.includes('cancer.org') || url.includes('cancer.net') || url.includes('nccn.org')) tags.push('cancer', 'oncology');
|
||||
else if (url.includes('asco.org') || url.includes('esmo.org')) tags.push('oncology', 'clinical');
|
||||
else if (url.includes('mskcc.org') || url.includes('mdanderson.org') || url.includes('dana-farber.org')) tags.push('oncology', 'research');
|
||||
else if (url.includes('dermnetnz') || url.includes('aad.org') || url.includes('jaad.org')) tags.push('dermatology');
|
||||
else if (url.includes('dermcoll') || url.includes('bad.org.uk') || url.includes('euroderm')) tags.push('dermatology');
|
||||
else if (url.includes('jidonline')) tags.push('dermatology', 'research');
|
||||
else if (url.includes('melanoma')) tags.push('melanoma', 'skin-cancer');
|
||||
else if (url.includes('acm.org') || url.includes('ieee')) tags.push('computer-science');
|
||||
else if (url.includes('clinicaltrials.gov')) tags.push('clinical-trials', 'medical');
|
||||
else if (url.includes('fda.gov') || url.includes('ema.europa.eu')) tags.push('regulatory', 'medical');
|
||||
else if (url.includes('nice.org.uk') || url.includes('cochrane.org')) tags.push('evidence-based', 'medical');
|
||||
else if (url.includes('hopkinsmedicine') || url.includes('stanfordmedicine')) tags.push('medical', 'academic');
|
||||
else if (url.includes('webmd') || url.includes('healthline') || url.includes('medscape')) tags.push('medical', 'clinical');
|
||||
else if (url.includes('uptodate.com')) tags.push('medical', 'clinical-decision');
|
||||
else if (url.includes('acm.org') || url.includes('ieee') || url.includes('dl.acm.org')) tags.push('computer-science');
|
||||
else if (url.includes('neurips') || url.includes('icml') || url.includes('aaai.org')) tags.push('ml', 'conference');
|
||||
else if (url.includes('cvpr') || url.includes('eccv') || url.includes('iccv')) tags.push('computer-vision', 'conference');
|
||||
else if (url.includes('aclanthology')) tags.push('nlp', 'conference');
|
||||
else if (url.includes('usenix') || url.includes('vldb') || url.includes('sigmod')) tags.push('systems', 'conference');
|
||||
else if (url.includes('huggingface') || url.includes('pytorch') || url.includes('tensorflow')) tags.push('ml', 'framework');
|
||||
else if (url.includes('deepmind') || url.includes('ai.meta') || url.includes('research.google')) tags.push('ml', 'research-lab');
|
||||
else if (url.includes('openai') || url.includes('anthropic')) tags.push('ml', 'research-lab');
|
||||
else if (url.includes('cs.stanford') || url.includes('cs.berkeley') || url.includes('cs.cmu') || url.includes('cs.mit')) tags.push('computer-science', 'academic');
|
||||
else if (url.includes('openreview') || url.includes('paperswithcode')) tags.push('ml', 'research');
|
||||
else if (url.includes('github') || url.includes('stackoverflow')) tags.push('programming');
|
||||
else if (url.includes('nature.com') || url.includes('nejm') || url.includes('lancet')) tags.push('journal', 'research');
|
||||
else if (url.includes('jamanetwork') || url.includes('annals.org') || url.includes('bmj.com')) tags.push('journal', 'medical');
|
||||
else if (url.includes('frontiersin') || url.includes('plos.org') || url.includes('biomedcentral')) tags.push('open-access', 'research');
|
||||
else if (url.includes('cell.com') || url.includes('elsevier') || url.includes('springer') || url.includes('wiley')) tags.push('journal', 'publisher');
|
||||
else if (url.includes('mdpi.com') || url.includes('karger') || url.includes('thieme')) tags.push('journal', 'publisher');
|
||||
else if (url.includes('jmlr.org') || url.includes('ijcai.org')) tags.push('ml', 'journal');
|
||||
|
||||
const lower = content.toLowerCase();
|
||||
if (lower.includes('melanoma')) tags.push('melanoma');
|
||||
|
|
|
|||
60
scripts/wet-full-import.sh
Executable file
60
scripts/wet-full-import.sh
Executable file
|
|
@ -0,0 +1,60 @@
|
|||
#!/bin/bash
# Full 6-year medical + CS import via WET processing
# Deploys and launches one Cloud Run Job per Common Crawl snapshot listed in
# CRAWLS (roughly two snapshots per year, 2020-2026).
#
# Usage: ./wet-full-import.sh [PROJECT] [SEGS_PER_CRAWL]
#   PROJECT         GCP project ID                 (default: ruv-dev)
#   SEGS_PER_CRAWL  WET segments to process/crawl  (default: 100)
set -euo pipefail

PROJECT="${1:-ruv-dev}"
SEGS_PER_CRAWL="${2:-100}"  # segments per crawl to process
REGION="us-central1"

# Locate deploy-wet-job.sh next to this script so the cwd does not matter.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Crawl indices to import (2020-2026)
CRAWLS=(
  "CC-MAIN-2020-16"
  "CC-MAIN-2020-50"
  "CC-MAIN-2021-17"
  "CC-MAIN-2021-43"
  "CC-MAIN-2022-05"
  "CC-MAIN-2022-33"
  "CC-MAIN-2023-06"
  "CC-MAIN-2023-40"
  "CC-MAIN-2024-10"
  "CC-MAIN-2024-42"
  "CC-MAIN-2025-13"
  "CC-MAIN-2025-40"
  "CC-MAIN-2026-06"
  "CC-MAIN-2026-08"
)

BRAIN_URL="https://pi.ruv.io"

echo "=== Full 6-Year Medical + CS Import ==="
echo "Crawls: ${#CRAWLS[@]}"
echo "Segments per crawl: $SEGS_PER_CRAWL"
echo "Total segments: $((${#CRAWLS[@]} * SEGS_PER_CRAWL))"
echo ""

# Best-effort baseline: any failure in the status call or JSON parse yields "0".
BEFORE=$(curl -s "$BRAIN_URL/v1/status" \
  | python3 -c "import sys,json; print(json.load(sys.stdin)['total_memories'])" 2>/dev/null || echo "0")
echo "Brain memories before: $BEFORE"
echo ""

for crawl in "${CRAWLS[@]}"; do
  echo "=== Deploying job for $crawl ==="
  bash "$SCRIPT_DIR/deploy-wet-job.sh" "$PROJECT" "$crawl" 0 "$SEGS_PER_CRAWL"

  # Execute the job. The name must be derived exactly as in deploy-wet-job.sh.
  JOB_NAME="wet-import-$(echo "$crawl" | tr '[:upper:]' '[:lower:]' | tr -d '-' | tail -c 8)"
  gcloud run jobs execute "$JOB_NAME" --project="$PROJECT" --region="$REGION" --async 2>&1

  echo "Job $JOB_NAME submitted (async)"
  echo ""

  # Don't flood -- wait 30s between job submissions
  sleep 30
done

echo ""
echo "=== All jobs submitted ==="
echo "Monitor with: gcloud run jobs executions list --project=$PROJECT --region=$REGION"
echo ""
echo "Check brain growth:"
echo " curl -s $BRAIN_URL/v1/status | python3 -c \"import sys,json; d=json.load(sys.stdin); print(f'Memories: {d[\\\"total_memories\\\"]}')\""
|
||||
Loading…
Add table
Add a link
Reference in a new issue