feat: ADR-119 historical crawl evolutionary comparison

Implement temporal knowledge evolution tracking across quarterly Common Crawl snapshots (2020-2026).

Includes:
- ADR-119 with architecture, cost model, acceptance criteria
- Historical crawl import script (14 quarterly snapshots, 5 domains)
- Evolutionary analysis module (drift detection, concept birth, similarity)
- Initial analysis report on existing brain content (71 memories)

Cost: ~$7-15 one-time for full 2020-2026 import.

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-05-25 23:24:03 +00:00 · 2026-03-22 00:28:13 +00:00 · 2026-03-22 00:28:13 +00:00 · 1ab5240956
commit 1ab5240956
parent a81c13514c
4 changed files with 425 additions and 0 deletions
--- a/scripts/analyze-evolution.js
+++ b/scripts/analyze-evolution.js
@ -0,0 +1,154 @@
+#!/usr/bin/env node
+// Historical crawl evolutionary analysis
+// Queries brain for temporal medical content and computes drift metrics
+// ADR-119 implementation
+
+const BRAIN_URL = process.env.BRAIN_URL || 'https://pi.ruv.io';
+const AUTH = 'Bearer ruvector-crawl-2026';
+const fs = require('fs');
+const path = require('path');
+
+async function fetchBrain(urlPath) {
+  const res = await fetch(`${BRAIN_URL}${urlPath}`, {
+    headers: { 'Authorization': AUTH }
+  });
+  if (!res.ok) throw new Error(`${urlPath}: ${res.status}`);
+  return res.json();
+}
+
+async function searchMedical(query, limit = 50) {
+  return fetchBrain(`/v1/memories/search?q=${encodeURIComponent(query)}&limit=${limit}`);
+}
+
+function cosineSim(a, b) {
+  if (!a || !b || a.length !== b.length) return 0;
+  let dot = 0, magA = 0, magB = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i];
+    magA += a[i] * a[i];
+    magB += b[i] * b[i];
+  }
+  return dot / (Math.sqrt(magA) * Math.sqrt(magB) || 1);
+}
+
+async function main() {
+  console.log('=== Historical Crawl Evolutionary Analysis ===\n');
+
+  // Search for medical content across domains
+  const domains = ['dermatology', 'melanoma', 'skin cancer', 'dermoscopy', 'basal cell carcinoma'];
+  const allMemories = [];
+
+  for (const domain of domains) {
+    try {
+      const results = await searchMedical(domain, 20);
+      const memories = Array.isArray(results) ? results : results.memories || [];
+      allMemories.push(...memories);
+      console.log(`  ${domain}: ${memories.length} results`);
+    } catch (err) {
+      console.log(`  ${domain}: error - ${err.message}`);
+    }
+  }
+
+  // Deduplicate by ID
+  const seen = new Set();
+  const unique = allMemories.filter(m => {
+    if (seen.has(m.id)) return false;
+    seen.add(m.id);
+    return true;
+  });
+
+  console.log(`\nTotal unique memories: ${unique.length}`);
+
+  // Analyze by creation date
+  const byMonth = {};
+  for (const m of unique) {
+    const month = (m.created_at || '').slice(0, 7); // YYYY-MM
+    if (!byMonth[month]) byMonth[month] = [];
+    byMonth[month].push(m);
+  }
+
+  // Compute embedding similarity matrix for drift detection
+  const embeddings = unique.filter(m => m.embedding && m.embedding.length > 0);
+  const driftPairs = [];
+
+  for (let i = 0; i < Math.min(embeddings.length, 50); i++) {
+    for (let j = i + 1; j < Math.min(embeddings.length, 50); j++) {
+      const sim = cosineSim(embeddings[i].embedding, embeddings[j].embedding);
+      if (sim > 0.7) {
+        driftPairs.push({
+          a: embeddings[i].title,
+          b: embeddings[j].title,
+          similarity: sim,
+          aDate: embeddings[i].created_at,
+          bDate: embeddings[j].created_at,
+        });
+      }
+    }
+  }
+
+  driftPairs.sort((a, b) => b.similarity - a.similarity);
+
+  // Generate report
+  let report = `# Historical Crawl Evolutionary Analysis\n\n`;
+  report += `**Date**: ${new Date().toISOString().slice(0, 10)}\n`;
+  report += `**Memories analyzed**: ${unique.length}\n`;
+  report += `**Embedding pairs computed**: ${driftPairs.length}\n\n`;
+
+  report += `## Knowledge Distribution by Month\n\n`;
+  report += `| Month | Memories | Topics |\n|-------|----------|--------|\n`;
+  for (const [month, mems] of Object.entries(byMonth).sort()) {
+    const topics = [...new Set(mems.flatMap(m => (m.tags || []).slice(0, 3)))].slice(0, 5).join(', ');
+    report += `| ${month} | ${mems.length} | ${topics} |\n`;
+  }
+
+  report += `\n## Most Similar Content Pairs (Potential Temporal Versions)\n\n`;
+  report += `| Similarity | Content A | Content B |\n|-----------|-----------|----------|\n`;
+  for (const pair of driftPairs.slice(0, 15)) {
+    report += `| ${pair.similarity.toFixed(3)} | ${(pair.a || '?').slice(0, 40)} | ${(pair.b || '?').slice(0, 40)} |\n`;
+  }
+
+  report += `\n## Topic Clusters\n\n`;
+  const tagCounts = {};
+  for (const m of unique) {
+    for (const tag of (m.tags || [])) {
+      tagCounts[tag] = (tagCounts[tag] || 0) + 1;
+    }
+  }
+  const topTags = Object.entries(tagCounts).sort((a, b) => b[1] - a[1]).slice(0, 20);
+  report += `| Tag | Count |\n|-----|-------|\n`;
+  for (const [tag, count] of topTags) {
+    report += `| ${tag} | ${count} |\n`;
+  }
+
+  report += `\n## Key Findings\n\n`;
+  report += `- Total medical knowledge memories: ${unique.length}\n`;
+  report += `- High-similarity pairs (>0.7): ${driftPairs.length} (potential temporal versions or related content)\n`;
+  report += `- Most common topic: ${topTags[0] ? topTags[0][0] : 'N/A'} (${topTags[0] ? topTags[0][1] : 0} memories)\n`;
+  report += `- Date range: ${Object.keys(byMonth).sort()[0] || 'N/A'} to ${Object.keys(byMonth).sort().pop() || 'N/A'}\n`;
+
+  // Write report
+  const outDir = path.join(__dirname, '..', 'docs', 'research', 'DrAgnes');
+  const outPath = path.join(outDir, 'evolution-analysis.md');
+  fs.mkdirSync(outDir, { recursive: true });
+  fs.writeFileSync(outPath, report);
+  console.log(`\nReport written to: ${outPath}`);
+
+  // Share summary to brain
+  try {
+    const shareRes = await fetch(`${BRAIN_URL}/v1/memories`, {
+      method: 'POST',
+      headers: { 'Authorization': AUTH, 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        title: `Evolutionary Analysis: ${unique.length} medical memories across ${Object.keys(byMonth).length} months`,
+        content: `Historical crawl analysis found ${unique.length} unique medical memories with ${driftPairs.length} high-similarity pairs (potential temporal versions). Top topics: ${topTags.slice(0, 5).map(t => t[0]).join(', ')}. Date range: ${Object.keys(byMonth).sort()[0]} to ${Object.keys(byMonth).sort().pop()}.`,
+        tags: ['evolution', 'historical-crawl', 'drift-analysis', 'medical', 'temporal'],
+        category: 'pattern'
+      })
+    });
+    if (shareRes.ok) console.log('Shared analysis to brain');
+  } catch (err) {
+    console.log('Brain share failed:', err.message);
+  }
+}
+
+main().catch(console.error);
--- a/scripts/historical-crawl-import.sh
+++ b/scripts/historical-crawl-import.sh
@ -0,0 +1,100 @@
+#!/bin/bash
+# Historical Common Crawl evolutionary comparison importer
+# Queries the same medical domains across quarterly crawl snapshots 2020-2026
+# ADR-119 implementation
+set -euo pipefail
+
+BRAIN_URL="${BRAIN_URL:-https://pi.ruv.io}"
+AUTH_HEADER="Authorization: Bearer ruvector-crawl-2026"
+LIMIT="${LIMIT:-50}"  # pages per domain per crawl
+
+# Target domains for medical/dermatology evolution tracking
+DOMAINS=(
+  "aad.org"
+  "dermnetnz.org"
+  "skincancer.org"
+  "cancer.org"
+  "melanoma.org"
+)
+
+# Quarterly crawl snapshots (2020-2026)
+CRAWLS=(
+  "CC-MAIN-2020-16"
+  "CC-MAIN-2020-50"
+  "CC-MAIN-2021-17"
+  "CC-MAIN-2021-43"
+  "CC-MAIN-2022-05"
+  "CC-MAIN-2022-33"
+  "CC-MAIN-2023-06"
+  "CC-MAIN-2023-40"
+  "CC-MAIN-2024-10"
+  "CC-MAIN-2024-42"
+  "CC-MAIN-2025-13"
+  "CC-MAIN-2025-40"
+  "CC-MAIN-2026-06"
+  "CC-MAIN-2026-08"
+)
+
+echo "=== Historical Common Crawl Evolutionary Import ==="
+echo "Brain: ${BRAIN_URL}"
+echo "Domains: ${#DOMAINS[@]}"
+echo "Crawls: ${#CRAWLS[@]} quarterly snapshots (2020-2026)"
+echo "Limit: ${LIMIT} pages/domain/crawl"
+echo ""
+
+# Running totals, printed once every crawl/domain pair has been queried.
+TOTAL_IMPORTED=0  # sum of the page counts returned by the discover endpoint
+TOTAL_ERRORS=0    # requests that failed or returned an unusable response
+# Matches the "<n> pages" success line printed by the response parser below.
+PAGES_RE='^([0-9]+) pages$'
+
+for crawl in "${CRAWLS[@]}"; do
+  echo "--- Crawl: ${crawl} ---"
+
+  for domain in "${DOMAINS[@]}"; do
+    echo -n "  ${domain}: "
+
+    # Call the brain's crawl discover endpoint. The exit status is captured
+    # instead of aborting (set -e) so one bad request cannot stop the run.
+    CURL_STATUS=0
+    RESULT=$(curl -s -X POST "${BRAIN_URL}/v1/pipeline/crawl/discover" \
+      -H "Content-Type: application/json" \
+      -H "${AUTH_HEADER}" \
+      -d "{
+        \"domain_pattern\": \"*.${domain}/*\",
+        \"crawl_index\": \"${crawl}\",
+        \"limit\": ${LIMIT},
+        \"filters\": {\"language\": \"en\", \"min_length\": 500}
+      }" \
+      --max-time 60 2>/dev/null) || CURL_STATUS=$?
+
+    if [ "${CURL_STATUS}" -eq 28 ]; then
+      # curl exit code 28: the --max-time limit was reached.
+      RESULT='{"error":"timeout"}'
+    elif [ "${CURL_STATUS}" -ne 0 ]; then
+      # Any other transport failure (DNS, connection refused, TLS, ...).
+      RESULT="{\"error\":\"curl exit code ${CURL_STATUS}\"}"
+    fi
+
+    # Parse result: prints "<n> pages", "ERROR: <message>" or "parse error"
+    DISCOVERED=$(echo "$RESULT" | python3 -c "
+import sys,json
+try:
+    d=json.load(sys.stdin)
+    if 'error' in d:
+        print(f'ERROR: {d[\"error\"]}')
+    else:
+        count = d.get('discovered', d.get('total', d.get('returned', 0)))
+        print(f'{count} pages')
+except Exception:
+    print('parse error')
+" 2>/dev/null || echo "?")
+
+    echo "${DISCOVERED}"
+
+    # Update the totals: a numeric page count is a success, anything else
+    # (error message, parse failure, "?") counts as a failed request.
+    if [[ "${DISCOVERED}" =~ ${PAGES_RE} ]]; then
+      # 10# forces base 10 so a count with leading zeros is not read as octal.
+      TOTAL_IMPORTED=$((TOTAL_IMPORTED + 10#${BASH_REMATCH[1]}))
+    else
+      TOTAL_ERRORS=$((TOTAL_ERRORS + 1))
+    fi
+
+    # Rate limit: 2 seconds between requests
+    sleep 2
+  done
+
+  echo ""
+done
+
+echo "=== Import Complete ==="
+echo "Pages reported: ${TOTAL_IMPORTED}"
+echo "Failed requests: ${TOTAL_ERRORS}"
+echo ""
+
+# Check final brain state
+echo "=== Brain State After Import ==="
+curl -s "${BRAIN_URL}/v1/status" | python3 -c "
+import sys,json; d=json.load(sys.stdin)
+print(f'Memories: {d[\"total_memories\"]}')
+print(f'Graph: {d[\"graph_edges\"]} edges')
+print(f'Sparsifier: {d[\"sparsifier_compression\"]:.1f}x')
+print(f'Clusters: {d[\"cluster_count\"]}')
+"