ruvector/scripts/wet-processor.sh

#!/bin/bash
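# Common Crawl WET Processor -- Medical + CS Corpus Import
# Downloads one WET segment (pre-extracted text, so no HTML parsing is needed),
# then streams it through wet-filter-inject.js for domain filtering and injection.
# Usage: ./wet-processor.sh [CRAWL_INDEX] [SEGMENT_NUM]
#   CRAWL_INDEX  Common Crawl ID (default: CC-MAIN-2026-08)
#   SEGMENT_NUM  zero-based index into the crawl's wet.paths list (default: 0)
# Environment:
#   BRAIN_URL    URL passed to the injector as --brain-url (default: https://pi.ruv.io)
# Requires: curl, gunzip, node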
# Strict mode: abort on failed commands, unset variables, and failed pipeline stages.
set -euo pipefail

# Positional arguments (both optional).
CRAWL_INDEX="${1:-CC-MAIN-2026-08}"  # crawl ID used to build data.commoncrawl.org URLs
SEGMENT_NUM="${2:-0}"                # zero-based index into the crawl's WET path list

# URL handed to the injector; override through the BRAIN_URL environment variable.
BRAIN_URL="${BRAIN_URL:-https://pi.ruv.io}"

# Authorization header handed to the injector. The token can be supplied through
# the BRAIN_TOKEN environment variable so deployments do not have to rely on the
# credential committed here; the default keeps the previous behavior.
AUTH="Authorization: Bearer ${BRAIN_TOKEN:-ruvector-crawl-2026}"

WORK_DIR="/tmp/wet-processing"  # scratch directory for the path list and the segment
BATCH_SIZE=10  # items per batch inject call

# Medical + CS domains to filter for.
# The list is later comma-joined and handed to wet-filter-inject.js via --domains,
# so the element order below is significant only insofar as that tool cares.
# NOTE(review): "ncbi.nlm.nih.gov"/"pubmed.ncbi.nlm.nih.gov" may already be covered
# by "nih.gov", and "dl.acm.org" by "acm.org" -- depends on how the filter matches; verify.
# NOTE(review): "nature.com/articles" is the only entry carrying a path -- confirm
# the filter handles that.
DOMAINS=()

# Medical and health sources
DOMAINS+=(
  "pubmed.ncbi.nlm.nih.gov" "ncbi.nlm.nih.gov" "who.int"
  "cancer.org" "aad.org" "skincancer.org" "dermnetnz.org" "melanoma.org"
  "mayoclinic.org" "clevelandclinic.org"
  "medlineplus.gov" "cdc.gov" "nih.gov"
)

# Journals and publishers
DOMAINS+=(
  "nejm.org" "thelancet.com" "bmj.com"
  "nature.com/articles" "sciencedirect.com"
)

# Computer science research
DOMAINS+=(
  "arxiv.org" "acm.org" "ieee.org" "dl.acm.org"
  "proceedings.mlr.press" "openreview.net" "paperswithcode.com"
)

# Code hosting, Q&A, and blogs
DOMAINS+=(
  "github.com" "stackoverflow.com"
  "medium.com" "towardsdatascience.com" "distill.pub"
)

# Make sure the scratch directory exists before anything is written into it.
mkdir -p "$WORK_DIR"

# Run banner: title, crawl ID, number of allow-listed domains, then a blank line.
printf '%s\n' \
  "=== WET Processor ===" \
  "Crawl: $CRAWL_INDEX" \
  "Domains: ${#DOMAINS[@]}" \
  ""

# Step 1: Get WET file list for this crawl
# Downloads the crawl's gzip-compressed path listing and stores it decompressed
# in $WORK_DIR/wet-paths.txt, one WET path per line.
echo "--- Fetching WET file paths ---"
PATHS_URL="https://data.commoncrawl.org/crawl-data/${CRAWL_INDEX}/wet.paths.gz"
# --fail      : an HTTP error (e.g. unknown crawl ID) fails curl itself instead of
#               feeding an error page to gunzip.
# -S          : keep curl's own error message visible even though -s hides progress.
# --max-time  : bound the request, matching the segment download below.
# gunzip's stderr stays suppressed (it only reports the truncated stream); the
# script's pipefail setting routes a failure of either stage to the handler.
curl -fsSL --max-time 300 "$PATHS_URL" | gunzip > "$WORK_DIR/wet-paths.txt" 2>/dev/null || {
  echo "ERROR: Could not fetch WET paths for $CRAWL_INDEX" >&2
  exit 1
}

# Each line of the listing is one segment, so the line count is the segment count.
TOTAL_SEGMENTS=$(wc -l < "$WORK_DIR/wet-paths.txt")
echo "Total WET segments: $TOTAL_SEGMENTS"

# Select segment to process
# SEGMENT_NUM is a zero-based index into wet-paths.txt. It is validated before it
# reaches arithmetic expansion: an unchecked value would be evaluated as an
# expression there (variable lookups, array subscripts), and a negative value
# would produce an invalid sed address.
if [[ ! "$SEGMENT_NUM" =~ ^[0-9]+$ ]]; then
  echo "ERROR: Segment number must be a non-negative integer (got '$SEGMENT_NUM')" >&2
  exit 1
fi

# 10# forces base 10 so zero-padded input such as "08" is not rejected as octal.
# sed line numbers are 1-based, hence the +1; q stops reading once the line is printed.
segment_line=$((10#$SEGMENT_NUM + 1))
WET_PATH=$(sed -n "${segment_line}{p;q;}" "$WORK_DIR/wet-paths.txt")
if [ -z "$WET_PATH" ]; then
  echo "ERROR: Segment $SEGMENT_NUM not found" >&2
  exit 1
fi

echo "Processing segment $SEGMENT_NUM: $WET_PATH"

# Step 2: Download WET segment
# The file is kept gzip-compressed on disk; decompression happens when it is
# streamed into the filter in Step 3.
echo "--- Downloading WET segment ---"
WET_FILE="$WORK_DIR/segment.wet.gz"
# --fail makes HTTP errors (403/404/503 ...) a curl failure; without it the error
# body would be saved as segment.wet.gz and the failure would only surface later
# as a gunzip error. -S keeps curl's error message visible despite -s.
curl -fsSL "https://data.commoncrawl.org/$WET_PATH" -o "$WET_FILE" --max-time 300 || {
  # Drop any partial or stale file so a later run cannot mistake it for a good download.
  rm -f -- "$WET_FILE"
  echo "ERROR: Download failed" >&2
  exit 1
}

echo "Downloaded: $(du -h "$WET_FILE" | cut -f1)"

# Step 3: Extract, filter by domain, and inject
echo "--- Processing and filtering ---"

# Safety net: remove the downloaded segment on every exit path. Without this, a
# failing pipeline under `set -e` would terminate the script before the cleanup
# below and leave the segment behind in the scratch directory.
trap 'rm -f -- "$WET_FILE"' EXIT

# Stream the decompressed segment into the Node filter that lives next to this
# script. The domain list is comma-joined inside a subshell so the IFS change
# does not leak into the rest of the script.
# NOTE(review): the Authorization header is passed as a command-line argument and
# is therefore visible in process listings; consider passing it via the
# environment if wet-filter-inject.js supports that.
gunzip -c "$WET_FILE" | node "$(dirname -- "$0")/wet-filter-inject.js" \
  --brain-url "$BRAIN_URL" \
  --auth "$AUTH" \
  --batch-size "$BATCH_SIZE" \
  --domains "$(IFS=,; echo "${DOMAINS[*]}")" \
  --crawl-index "$CRAWL_INDEX" || {
  # Preserve the pipeline's exit status while reporting the failure like the other steps do.
  status=$?
  echo "ERROR: Filter/inject pipeline failed (exit $status)" >&2
  exit "$status"
}

# Cleanup
rm -f -- "$WET_FILE"
echo ""
echo "=== Done ==="