mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-30 20:43:38 +00:00
Bypasses broken CDX HTML extractor by processing pre-extracted text from Common Crawl WET files. Filters by 30 medical + CS domains, chunks content, and batch injects into pi.ruv.io brain. Includes: processor, filter/injector, Cloud Run Job config, orchestrator for multi-segment processing. Target: full corpus in 6 weeks at ~$200 total cost. Co-Authored-By: claude-flow <ruv@ruv.net>
53 lines
1.7 KiB
Bash
Executable file
53 lines
1.7 KiB
Bash
Executable file
#!/bin/bash
# Orchestrate WET processing across multiple segments
#
# Runs wet-processor.sh (looked up in this script's own directory) once per
# segment, and prints the brain's total_memories count read from
# $BRAIN_URL/v1/status before the run, after every segment, and at the end.
# A failing segment is reported and skipped; it does not abort the run.
#
# Usage: ./wet-orchestrate.sh [CRAWL_INDEX] [START_SEGMENT] [NUM_SEGMENTS]
#
# Environment:
#   BRAIN_URL     Base URL of the brain API (default: https://pi.ruv.io)
#   CURL_TIMEOUT  Max seconds allowed per status request (default: 30)
set -euo pipefail

CRAWL_INDEX="${1:-CC-MAIN-2026-08}"
START="${2:-0}"
COUNT="${3:-5}" # Process 5 segments by default
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
BRAIN_URL="${BRAIN_URL:-https://pi.ruv.io}"
CURL_TIMEOUT="${CURL_TIMEOUT:-30}"

# Print an error message to stderr and exit with status 2 (usage error).
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 2
}

# Reject anything that is not a plain non-negative integer up front, so the
# arithmetic below cannot fail with a cryptic message under `set -e`.
[[ "$START" =~ ^[0-9]+$ ]] \
  || die "START_SEGMENT must be a non-negative integer, got '$START'"
[[ "$COUNT" =~ ^[0-9]+$ ]] \
  || die "NUM_SEGMENTS must be a non-negative integer, got '$COUNT'"

# Force base 10: bash arithmetic would otherwise read "08" as invalid octal.
START=$((10#$START))
COUNT=$((10#$COUNT))

# Write the raw body of the brain status endpoint to stdout.
# Returns curl's exit status. --max-time keeps a stalled endpoint from
# hanging the whole orchestration.
fetch_status() {
  curl -s --max-time "$CURL_TIMEOUT" "$BRAIN_URL/v1/status"
}

# Print the brain's total_memories as a non-negative integer.
# Prints 0 when the request fails, the body is not JSON, or the field is not
# a plain integer, so callers can always use the result in $(( ... )).
memory_count() {
  local count
  count=$(fetch_status \
    | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('total_memories', 0))" 2>/dev/null) \
    || count=0
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  printf '%s\n' "$count"
}

echo "=== WET Orchestrator ==="
echo "Crawl: $CRAWL_INDEX"
echo "Segments: $START to $((START + COUNT - 1))"
echo ""

# Record starting state
BEFORE=$(memory_count)
echo "Brain memories before: $BEFORE"
echo ""

failed=0
# Arithmetic loop instead of `seq`: with COUNT=0 some seq implementations
# count downward from START to START-1 instead of producing nothing.
for ((i = START; i < START + COUNT; i++)); do
  echo "=== Segment $i ==="
  if ! bash "$SCRIPT_DIR/wet-processor.sh" "$CRAWL_INDEX" "$i" 2>&1; then
    echo "Segment $i failed, continuing..."
    failed=$((failed + 1))
  fi

  # Brief pause between segments
  sleep 5

  # Check brain growth
  CURRENT=$(memory_count)
  echo "Brain memories: $CURRENT (+$((CURRENT - BEFORE)) total)"
  echo ""
done

# Final report
echo "--- Final Report ---"
# Fetch first, then parse: a failed request yields an empty body, which the
# Python snippet reports once via its own except branch.
status_json=$(fetch_status) || status_json=""
python3 -c "
import sys, json
try:
    d = json.load(sys.stdin)
    print(f'Final state:')
    print(f' Memories: {d.get(\"total_memories\", \"N/A\")}')
    print(f' Graph: {d.get(\"graph_edges\", \"N/A\")} edges')
    print(f' Sparsifier: {d.get(\"sparsifier_compression\", 0):.1f}x')
except Exception as e:
    print(f'Could not fetch final status: {e}')
" <<<"$status_json" 2>/dev/null || echo "Could not fetch final brain status"

if ((failed > 0)); then
  echo "Segments failed: $failed of $COUNT"
fi
echo ""
echo "=== Orchestration Complete ==="