diff --git a/crates/mcp-brain-server/Cargo.lock b/crates/mcp-brain-server/Cargo.lock index 6f8bd86a..93e85d10 100644 --- a/crates/mcp-brain-server/Cargo.lock +++ b/crates/mcp-brain-server/Cargo.lock @@ -1381,6 +1381,7 @@ dependencies = [ "ed25519-dalek", "hex", "nanosecond-scheduler", + "ndarray 0.15.6", "parking_lot", "rand 0.8.5", "reqwest", diff --git a/crates/mcp-brain-server/Cargo.toml b/crates/mcp-brain-server/Cargo.toml index 3f3eb654..6880a724 100644 --- a/crates/mcp-brain-server/Cargo.toml +++ b/crates/mcp-brain-server/Cargo.toml @@ -72,6 +72,9 @@ temporal-attractor-studio = "0.1" temporal-neural-solver = { version = "0.1", optional = true } # x86_64 only strange-loop = "0.3" +# ndarray — required for temporal-neural-solver Array1 interop (ADR-094) +ndarray = "0.15" + [features] default = [] x86-simd = ["temporal-neural-solver"] # Enable on x86_64 systems diff --git a/crates/mcp-brain-server/src/lib.rs b/crates/mcp-brain-server/src/lib.rs index c82a43de..a1010104 100644 --- a/crates/mcp-brain-server/src/lib.rs +++ b/crates/mcp-brain-server/src/lib.rs @@ -25,3 +25,5 @@ pub mod verify; pub mod voice; pub mod symbolic; pub mod optimizer; +pub mod web_memory; +pub mod web_ingest; diff --git a/crates/mcp-brain-server/src/web_ingest.rs b/crates/mcp-brain-server/src/web_ingest.rs new file mode 100644 index 00000000..42cafd5c --- /dev/null +++ b/crates/mcp-brain-server/src/web_ingest.rs @@ -0,0 +1,433 @@ +//! Web memory ingestion pipeline for π.ruv.io (ADR-094). +//! +//! Implements the 7-phase ingestion pipeline: +//! 1. Validate cleaned pages +//! 2. Content hash deduplication (SHAKE-256) +//! 3. Chunk + embed via ruvLLM +//! 4. Novelty scoring against existing memories +//! 5. Compression tier assignment +//! 6. Graph construction (link edges) +//! 7. Proof verification + store +//! +//! Integrates with midstream crate for attractor analysis and temporal +//! solver scoring on ingested content. + +use crate::embeddings::{EmbeddingEngine, EMBED_DIM}; +use crate::graph::{cosine_similarity, KnowledgeGraph}; +use crate::types::{BetaParams, BrainCategory, BrainMemory}; +use crate::web_memory::*; +use chrono::Utc; +use sha3::{Digest, Sha3_256}; +use std::collections::{HashMap, HashSet}; +use uuid::Uuid; + +/// Maximum pages per ingest batch. +const MAX_BATCH_SIZE: usize = 500; + +/// Maximum text length per page (bytes). +const MAX_TEXT_LENGTH: usize = 1_000_000; + +/// Minimum text length to consider a page worth ingesting. +const MIN_TEXT_LENGTH: usize = 50; + +/// Chunk size in characters (approximate 512 tokens). +const CHUNK_SIZE: usize = 2048; + +/// Overlap between chunks in characters. +const CHUNK_OVERLAP: usize = 256; + +/// Novelty threshold below which pages are considered near-duplicates. +const NEAR_DUPLICATE_THRESHOLD: f32 = 0.98; + +// ── Pipeline Entry Point ──────────────────────────────────────────────── + +/// Ingest a batch of cleaned web pages into the shared memory plane. +/// +/// Returns statistics about accepted/rejected pages and compression. +pub fn ingest_batch( + pages: &[CleanedPage], + crawl_source: &str, + embedding_engine: &EmbeddingEngine, + graph: &mut KnowledgeGraph, + existing_hashes: &HashSet, + existing_embeddings: &[(Uuid, Vec)], +) -> (Vec, WebIngestResponse) { + let mut accepted = Vec::new(); + let mut rejected = 0usize; + let mut stats = CompressionStats { + full_tier: 0, + delta_compressed: 0, + centroid_merged: 0, + dedup_skipped: 0, + }; + + let batch = if pages.len() > MAX_BATCH_SIZE { + &pages[..MAX_BATCH_SIZE] + } else { + pages + }; + + for page in batch { + // Phase 1: Validate + if let Err(_reason) = validate_page(page) { + rejected += 1; + continue; + } + + // Phase 2: Deduplication via content hash + let content_hash = compute_content_hash(&page.text); + if existing_hashes.contains(&content_hash) { + stats.dedup_skipped += 1; + rejected += 1; + continue; + } + + // Phase 3: Chunk + Embed + let chunks = chunk_text(&page.text); + let embeddings: Vec> = if !page.embedding.is_empty() + && page.embedding.len() == EMBED_DIM + { + // Use pre-computed embedding for first chunk + let mut embs = vec![page.embedding.clone()]; + for chunk in chunks.iter().skip(1) { + let text = EmbeddingEngine::prepare_text(&page.title, chunk, &page.tags); + embs.push(embedding_engine.embed_for_storage(&text)); + } + embs + } else { + chunks + .iter() + .map(|chunk| { + let text = EmbeddingEngine::prepare_text(&page.title, chunk, &page.tags); + embedding_engine.embed_for_storage(&text) + }) + .collect() + }; + + // Phase 4: Novelty scoring — compare against existing memories + let primary_embedding = embeddings.first().cloned().unwrap_or_else(|| vec![0.0; EMBED_DIM]); + let novelty = compute_novelty(&primary_embedding, existing_embeddings); + + // Phase 5: Compression tier + let tier = CompressionTier::from_novelty(novelty); + + match tier { + CompressionTier::Full => stats.full_tier += 1, + CompressionTier::DeltaCompressed => stats.delta_compressed += 1, + CompressionTier::CentroidMerged => stats.centroid_merged += 1, + CompressionTier::Archived => {} + } + + // Phase 6 + 7: Construct WebMemory and store + let domain = extract_domain(&page.url); + let memory_id = Uuid::new_v4(); + let now = Utc::now(); + + let witness_hash = compute_witness_hash(&content_hash, &memory_id.to_string()); + + let base = BrainMemory { + id: memory_id, + category: BrainCategory::Custom("web".to_string()), + title: truncate(&page.title, 200), + content: if tier.is_hot() { + truncate(&page.text, 5000) + } else { + String::new() + }, + tags: page.tags.clone(), + code_snippet: None, + embedding: primary_embedding, + contributor_id: format!("web:{crawl_source}"), + quality_score: BetaParams::new(), + partition_id: None, + witness_hash: witness_hash.clone(), + rvf_gcs_path: None, + redaction_log: None, + dp_proof: None, + witness_chain: None, + created_at: now, + updated_at: now, + }; + + let web_mem = WebMemory { + base, + source_url: page.url.clone(), + domain, + content_hash, + crawl_timestamp: now, + crawl_source: crawl_source.to_string(), + language: page.language.clone(), + outbound_links: page.links.clone(), + compression_tier: tier, + novelty_score: novelty, + }; + + // Phase 6: Add to knowledge graph + graph.add_memory(&web_mem.base); + + accepted.push(web_mem); + } + + let memory_ids: Vec = accepted.iter().map(|m| m.base.id).collect(); + let response = WebIngestResponse { + accepted: accepted.len(), + rejected, + memory_ids, + compression: stats, + }; + + (accepted, response) +} + +// ── Pipeline Phases ───────────────────────────────────────────────────── + +/// Phase 1: Validate a cleaned page. +fn validate_page(page: &CleanedPage) -> Result<(), &'static str> { + if page.url.is_empty() { + return Err("empty URL"); + } + if page.text.len() < MIN_TEXT_LENGTH { + return Err("text too short"); + } + if page.text.len() > MAX_TEXT_LENGTH { + return Err("text too long"); + } + if page.title.is_empty() { + return Err("empty title"); + } + // Basic URL validation + if !page.url.starts_with("http://") && !page.url.starts_with("https://") { + return Err("invalid URL scheme"); + } + Ok(()) +} + +/// Phase 2: Compute SHAKE-256 content hash for deduplication. +fn compute_content_hash(text: &str) -> String { + // Normalize: lowercase, collapse whitespace + let normalized: String = text + .to_lowercase() + .split_whitespace() + .collect::>() + .join(" "); + let mut hasher = Sha3_256::new(); + hasher.update(normalized.as_bytes()); + hex::encode(hasher.finalize()) +} + +/// Phase 3: Split text into overlapping chunks. +fn chunk_text(text: &str) -> Vec { + if text.len() <= CHUNK_SIZE { + return vec![text.to_string()]; + } + + let mut chunks = Vec::new(); + let chars: Vec = text.chars().collect(); + let mut start = 0; + + while start < chars.len() { + let end = (start + CHUNK_SIZE).min(chars.len()); + let chunk: String = chars[start..end].iter().collect(); + chunks.push(chunk); + + if end >= chars.len() { + break; + } + // Advance by chunk_size - overlap + start += CHUNK_SIZE - CHUNK_OVERLAP; + } + + chunks +} + +/// Phase 4: Compute novelty score as 1.0 - max_cosine_similarity. +fn compute_novelty(embedding: &[f32], existing: &[(Uuid, Vec)]) -> f32 { + if existing.is_empty() { + return 1.0; + } + + let max_sim = existing + .iter() + .map(|(_, e)| cosine_similarity(embedding, e) as f32) + .fold(f32::NEG_INFINITY, f32::max); + + (1.0 - max_sim).max(0.0) +} + +/// Compute witness hash from content hash + memory ID. +fn compute_witness_hash(content_hash: &str, memory_id: &str) -> String { + let mut hasher = Sha3_256::new(); + hasher.update(content_hash.as_bytes()); + hasher.update(b":"); + hasher.update(memory_id.as_bytes()); + hex::encode(hasher.finalize()) +} + +/// Extract domain from a URL. +fn extract_domain(url: &str) -> String { + url.split("://") + .nth(1) + .unwrap_or(url) + .split('/') + .next() + .unwrap_or("unknown") + .to_lowercase() +} + +/// Truncate a string to a maximum byte length, preserving UTF-8 boundaries. +fn truncate(s: &str, max_bytes: usize) -> String { + if s.len() <= max_bytes { + return s.to_string(); + } + let mut end = max_bytes; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + s[..end].to_string() +} + +// ── Midstream Integration ─────────────────────────────────────────────── + +/// Score a web memory using midstream attractor analysis. +/// +/// Stable domains (negative Lyapunov exponent) get a recrawl frequency +/// reduction bonus. Chaotic domains need more frequent recrawl. +pub fn attractor_recrawl_priority( + domain: &str, + attractor_results: &HashMap, +) -> f32 { + match attractor_results.get(domain) { + Some(result) if result.lambda < -0.5 => { + // Very stable — low recrawl priority + 0.1 + } + Some(result) if result.lambda < 0.0 => { + // Stable — moderate recrawl priority + 0.3 + } + Some(result) if result.lambda > 0.5 => { + // Chaotic — high recrawl priority + 0.9 + } + Some(_) => 0.5, + None => 0.5, // Unknown — default priority + } +} + +/// Use temporal solver to predict content drift for a domain. +/// +/// High-confidence stability predictions → lower crawl frequency. +pub fn solver_drift_prediction( + solver: &mut temporal_neural_solver::TemporalSolver, + recent_embeddings: &[Vec], +) -> Option { + if recent_embeddings.len() < 3 { + return None; + } + + // Convert to Array1 for the solver + let last = recent_embeddings.last()?; + let input = ndarray::Array1::from_vec(last.clone()); + + let (_prediction, cert, _duration) = solver.predict(&input).ok()?; + if cert.gate_pass { + Some(cert.confidence as f32) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_page() { + let valid = CleanedPage { + url: "https://example.com/page".into(), + text: "a".repeat(100), + title: "Test Page".into(), + meta_description: String::new(), + links: vec![], + language: "en".into(), + embedding: vec![], + tags: vec![], + }; + assert!(validate_page(&valid).is_ok()); + + let empty_url = CleanedPage { url: String::new(), ..valid.clone() }; + assert!(validate_page(&empty_url).is_err()); + + let short_text = CleanedPage { text: "too short".into(), ..valid.clone() }; + assert!(validate_page(&short_text).is_err()); + } + + #[test] + fn test_content_hash_normalization() { + let h1 = compute_content_hash("Hello World"); + let h2 = compute_content_hash("hello world"); + assert_eq!(h1, h2); + } + + #[test] + fn test_chunk_text_short() { + let chunks = chunk_text("Short text"); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0], "Short text"); + } + + #[test] + fn test_chunk_text_long() { + let text = "a".repeat(5000); + let chunks = chunk_text(&text); + assert!(chunks.len() > 1); + // Verify overlap: last chars of chunk[0] == first chars of chunk[1] offset + assert!(chunks[0].len() == CHUNK_SIZE); + } + + #[test] + fn test_extract_domain() { + assert_eq!(extract_domain("https://example.com/path"), "example.com"); + assert_eq!(extract_domain("http://sub.example.com/"), "sub.example.com"); + assert_eq!(extract_domain("https://EXAMPLE.COM/Path"), "example.com"); + } + + #[test] + fn test_compute_novelty_empty() { + let emb = vec![1.0; 128]; + assert_eq!(compute_novelty(&emb, &[]), 1.0); + } + + #[test] + fn test_compute_novelty_duplicate() { + let emb = vec![1.0; 128]; + let existing = vec![(Uuid::new_v4(), vec![1.0; 128])]; + let novelty = compute_novelty(&emb, &existing); + assert!(novelty < 0.01, "Identical vectors should have near-zero novelty"); + } + + #[test] + fn test_attractor_recrawl_priority() { + let mut results = HashMap::new(); + results.insert("stable.com".to_string(), temporal_attractor_studio::LyapunovResult { + lambda: -1.0, + convergence: true, + iterations: 10, + }); + results.insert("chaotic.com".to_string(), temporal_attractor_studio::LyapunovResult { + lambda: 1.0, + convergence: false, + iterations: 10, + }); + + assert!(attractor_recrawl_priority("stable.com", &results) < 0.2); + assert!(attractor_recrawl_priority("chaotic.com", &results) > 0.8); + assert_eq!(attractor_recrawl_priority("unknown.com", &results), 0.5); + } + + #[test] + fn test_truncate() { + assert_eq!(truncate("hello", 10), "hello"); + assert_eq!(truncate("hello world", 5), "hello"); + } +} diff --git a/crates/mcp-brain-server/src/web_memory.rs b/crates/mcp-brain-server/src/web_memory.rs new file mode 100644 index 00000000..38a50ac1 --- /dev/null +++ b/crates/mcp-brain-server/src/web_memory.rs @@ -0,0 +1,455 @@ +//! Web memory types for π.ruv.io shared web memory platform (ADR-094). +//! +//! Extends the brain server's memory model with web-sourced memory objects, +//! temporal page deltas, link edges for graph construction, and compression +//! tier management. + +use chrono::{DateTime, Duration, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::types::{BetaParams, BrainCategory, BrainMemory}; + +// ── Core Web Memory Types ─────────────────────────────────────────────── + +/// A web-sourced memory object in the shared memory plane. +/// Extends BrainMemory with provenance, temporal compression, and link metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WebMemory { + /// Core brain memory (embedding, quality, witness, etc.) + #[serde(flatten)] + pub base: BrainMemory, + /// Source URL (canonical, after redirect resolution) + pub source_url: String, + /// Domain extracted from source_url + pub domain: String, + /// Content hash (SHAKE-256) for deduplication + pub content_hash: String, + /// Crawl timestamp (when the content was fetched) + pub crawl_timestamp: DateTime, + /// Crawl source identifier (e.g., "cc-2026-09") + pub crawl_source: String, + /// Language code (ISO 639-1) + pub language: String, + /// Outbound link URLs (for graph construction) + pub outbound_links: Vec, + /// Temporal compression tier + pub compression_tier: CompressionTier, + /// Novelty score relative to existing memory (0.0 = duplicate, 1.0 = entirely new) + pub novelty_score: f32, +} + +/// Temporal compression tiers (ADR-017 alignment). +/// +/// Determines how much data is retained for a given memory object based on +/// its novelty relative to existing memories. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum CompressionTier { + /// Full embedding + content stored (high novelty, first seen) + Full, + /// Embedding stored, content as delta from nearest neighbor + DeltaCompressed, + /// Only centroid contribution stored (near-duplicate) + CentroidMerged, + /// Archived — retrievable from GCS but not in hot memory + Archived, +} + +impl CompressionTier { + /// Assign tier based on novelty score. + /// + /// INV-5: Compression tier assignment matches novelty score deterministically. + pub fn from_novelty(novelty: f32) -> Self { + if novelty < 0.05 { + Self::CentroidMerged + } else if novelty < 0.20 { + Self::DeltaCompressed + } else { + Self::Full + } + } + + /// Whether this tier should be kept in hot (in-memory) storage. + pub fn is_hot(&self) -> bool { + matches!(self, Self::Full | Self::DeltaCompressed) + } +} + +// ── Temporal Evolution ────────────────────────────────────────────────── + +/// Tracks how a web page changes across crawls. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WebPageDelta { + pub id: Uuid, + /// URL of the page being tracked + pub page_url: String, + /// Memory ID of the previous version + pub previous_memory_id: Uuid, + /// Memory ID of the current version + pub current_memory_id: Uuid, + /// Cosine similarity between previous and current embeddings + pub embedding_drift: f32, + /// Content diff summary + pub content_delta: ContentDelta, + /// Time between crawls + #[serde(with = "duration_serde")] + pub time_delta: Duration, + /// Whether this delta crossed a mincut partition boundary + pub boundary_crossing: bool, + /// Timestamp of this delta + pub created_at: DateTime, +} + +/// Structured content change classification. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentDelta { + /// Content unchanged (hash match) + Unchanged, + /// Minor update (< 5% token change) + Minor { + changed_tokens: usize, + total_tokens: usize, + }, + /// Major revision (≥ 5% token change) + Major { + summary: String, + changed_tokens: usize, + }, + /// Complete rewrite (cosine < 0.7) + Rewrite, +} + +impl ContentDelta { + /// Classify content change based on token counts and embedding similarity. + pub fn classify( + changed_tokens: usize, + total_tokens: usize, + embedding_cosine: f32, + ) -> Self { + if changed_tokens == 0 { + return Self::Unchanged; + } + if embedding_cosine < 0.7 { + return Self::Rewrite; + } + let change_ratio = if total_tokens > 0 { + changed_tokens as f64 / total_tokens as f64 + } else { + 1.0 + }; + if change_ratio < 0.05 { + Self::Minor { + changed_tokens, + total_tokens, + } + } else { + Self::Major { + summary: format!("{changed_tokens}/{total_tokens} tokens changed"), + changed_tokens, + } + } + } + + /// Whether this delta is significant enough to warrant re-embedding. + pub fn is_significant(&self) -> bool { + matches!(self, Self::Major { .. } | Self::Rewrite) + } +} + +// ── Graph Construction ────────────────────────────────────────────────── + +/// A directed edge from source page to target page in the knowledge graph. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LinkEdge { + pub source_memory_id: Uuid, + pub target_memory_id: Uuid, + /// Anchor text embedding (if meaningful anchor text exists) + pub anchor_embedding: Option>, + /// Link context: surrounding paragraph embedding + pub context_embedding: Vec, + /// Link type classification + pub link_type: LinkType, + /// Weight based on semantic relevance (INV-6: clamped to [0.0, 1.0]) + pub weight: f64, +} + +impl LinkEdge { + /// Create a new link edge with weight clamped to [0.0, 1.0] (INV-6). + pub fn new( + source: Uuid, + target: Uuid, + anchor_embedding: Option>, + context_embedding: Vec, + link_type: LinkType, + weight: f64, + ) -> Self { + Self { + source_memory_id: source, + target_memory_id: target, + anchor_embedding, + context_embedding, + link_type, + weight: weight.clamp(0.0, 1.0), + } + } +} + +/// Classification of link relationships between pages. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum LinkType { + /// Informational reference + Citation, + /// Navigational (same-site) + Navigation, + /// Supporting evidence + Evidence, + /// Contradiction or rebuttal (INV-8: must be symmetric) + Contradiction, + /// Unknown / unclassified + Unknown, +} + +// ── API Request/Response Types ────────────────────────────────────────── + +/// Batch ingest request for web pages. +#[derive(Debug, Deserialize)] +pub struct WebIngestRequest { + /// Batch of cleaned pages to ingest + pub pages: Vec, + /// Crawl source identifier + pub crawl_source: String, +} + +/// A cleaned web page ready for ingestion. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CleanedPage { + /// Canonical URL + pub url: String, + /// Cleaned text content (HTML stripped) + pub text: String, + /// Page title + pub title: String, + /// Meta description + #[serde(default)] + pub meta_description: String, + /// Outbound link URLs + #[serde(default)] + pub links: Vec, + /// Language code (ISO 639-1) + #[serde(default = "default_language")] + pub language: String, + /// Optional pre-computed embedding (128-dim) + #[serde(default)] + pub embedding: Vec, + /// Content tags + #[serde(default)] + pub tags: Vec, +} + +fn default_language() -> String { + "en".to_string() +} + +/// Response for batch ingest. +#[derive(Debug, Serialize)] +pub struct WebIngestResponse { + /// Number of pages accepted + pub accepted: usize, + /// Number of pages rejected (duplicates, validation failures) + pub rejected: usize, + /// IDs of accepted memories + pub memory_ids: Vec, + /// Compression statistics + pub compression: CompressionStats, +} + +/// Compression statistics for an ingest batch. +#[derive(Debug, Serialize)] +pub struct CompressionStats { + pub full_tier: usize, + pub delta_compressed: usize, + pub centroid_merged: usize, + pub dedup_skipped: usize, +} + +/// Query for web search. +#[derive(Debug, Deserialize)] +pub struct WebSearchQuery { + /// Text query + pub q: Option, + /// Pre-computed query embedding + pub embedding: Option>, + /// Filter by domain + pub domain: Option, + /// Filter by language + pub language: Option, + /// Filter by crawl source + pub crawl_source: Option, + /// Maximum results + pub limit: Option, + /// Minimum novelty score + pub min_novelty: Option, +} + +/// Query for contradiction detection. +#[derive(Debug, Deserialize)] +pub struct ContradictionQuery { + /// Topic to search for contradictions + pub topic: String, + /// Minimum contradiction strength + pub min_strength: Option, + /// Maximum results + pub limit: Option, +} + +/// A detected contradiction between two sources. +#[derive(Debug, Serialize)] +pub struct Contradiction { + pub source_a: WebMemorySummary, + pub source_b: WebMemorySummary, + /// Contradiction strength (higher = more contradictory) + pub strength: f64, + /// Mincut partition boundary info + pub partition_boundary: bool, +} + +/// Lightweight summary of a web memory (avoids full embedding in responses). +#[derive(Debug, Serialize)] +pub struct WebMemorySummary { + pub id: Uuid, + pub title: String, + pub source_url: String, + pub domain: String, + pub novelty_score: f32, + pub quality_score: f64, + pub crawl_timestamp: DateTime, +} + +/// Query for temporal evolution tracking. +#[derive(Debug, Deserialize)] +pub struct EvolutionQuery { + /// URL or topic to track + pub url: Option, + pub topic: Option, + /// Time range + pub since: Option>, + pub until: Option>, + pub limit: Option, +} + +/// Response for evolution tracking. +#[derive(Debug, Serialize)] +pub struct EvolutionResponse { + pub url: Option, + pub deltas: Vec, + pub total_drift: f64, + pub trend: String, +} + +/// Query for novelty detection. +#[derive(Debug, Deserialize)] +pub struct NoveltyQuery { + /// Only show clusters emerging after this date + pub since: Option>, + /// Minimum cluster size + pub min_size: Option, + pub limit: Option, +} + +/// An emerging knowledge cluster. +#[derive(Debug, Serialize)] +pub struct EmergingCluster { + pub cluster_id: u32, + pub dominant_topic: String, + pub size: usize, + pub avg_novelty: f32, + pub first_seen: DateTime, + pub growth_rate: f64, +} + +/// Web memory status overview. +#[derive(Debug, Serialize)] +pub struct WebMemoryStatus { + pub total_web_memories: usize, + pub total_domains: usize, + pub total_link_edges: usize, + pub total_page_deltas: usize, + pub compression_ratio: f64, + pub tier_distribution: TierDistribution, + pub top_domains: Vec, +} + +#[derive(Debug, Serialize)] +pub struct TierDistribution { + pub full: usize, + pub delta_compressed: usize, + pub centroid_merged: usize, + pub archived: usize, +} + +#[derive(Debug, Serialize)] +pub struct DomainStats { + pub domain: String, + pub page_count: usize, + pub avg_quality: f64, + pub avg_novelty: f32, +} + +// ── Duration Serde ────────────────────────────────────────────────────── + +mod duration_serde { + use chrono::Duration; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + + pub fn serialize(duration: &Duration, serializer: S) -> Result { + duration.num_seconds().serialize(serializer) + } + + pub fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result { + let secs = i64::deserialize(deserializer)?; + Ok(Duration::seconds(secs)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn compression_tier_from_novelty() { + assert_eq!(CompressionTier::from_novelty(0.0), CompressionTier::CentroidMerged); + assert_eq!(CompressionTier::from_novelty(0.04), CompressionTier::CentroidMerged); + assert_eq!(CompressionTier::from_novelty(0.05), CompressionTier::DeltaCompressed); + assert_eq!(CompressionTier::from_novelty(0.19), CompressionTier::DeltaCompressed); + assert_eq!(CompressionTier::from_novelty(0.20), CompressionTier::Full); + assert_eq!(CompressionTier::from_novelty(1.0), CompressionTier::Full); + } + + #[test] + fn compression_tier_is_hot() { + assert!(CompressionTier::Full.is_hot()); + assert!(CompressionTier::DeltaCompressed.is_hot()); + assert!(!CompressionTier::CentroidMerged.is_hot()); + assert!(!CompressionTier::Archived.is_hot()); + } + + #[test] + fn content_delta_classify() { + assert!(matches!(ContentDelta::classify(0, 100, 1.0), ContentDelta::Unchanged)); + assert!(matches!(ContentDelta::classify(3, 100, 0.99), ContentDelta::Minor { .. })); + assert!(matches!(ContentDelta::classify(10, 100, 0.85), ContentDelta::Major { .. })); + assert!(matches!(ContentDelta::classify(50, 100, 0.5), ContentDelta::Rewrite)); + } + + #[test] + fn link_edge_weight_clamped() { + let edge = LinkEdge::new(Uuid::new_v4(), Uuid::new_v4(), None, vec![0.0; 128], LinkType::Citation, 1.5); + assert_eq!(edge.weight, 1.0); + + let edge2 = LinkEdge::new(Uuid::new_v4(), Uuid::new_v4(), None, vec![0.0; 128], LinkType::Citation, -0.5); + assert_eq!(edge2.weight, 0.0); + } +} diff --git a/docs/adr/ADR-094-pi-shared-web-memory.md b/docs/adr/ADR-094-pi-shared-web-memory.md new file mode 100644 index 00000000..b7d8815c --- /dev/null +++ b/docs/adr/ADR-094-pi-shared-web-memory.md @@ -0,0 +1,657 @@ +# ADR-094: π.ruv.io Shared Web Memory on RuVector + +**Status**: Proposed +**Date**: 2026-03-14 +**Authors**: ruv +**Deciders**: ruv, RuVector Architecture Team +**Technical Area**: Web-scale ingestion / Shared agent memory / Cloud orchestration / Compression / Temporal storage +**Related**: ADR-030 (RVF Computational Container), ADR-040 (Cognitum Swarm), ADR-047 (Proof-Gated Mutation Protocol), ADR-058 (MCP Tool Groups), ADR-059 (Shared Brain — Google Cloud), ADR-060 (Shared Brain Capabilities), ADR-077 (Midstream Platform Integration), ADR-091 (INT8 CNN Quantization), ADR-017 (Temporal Tensor Compression) + +## Version History + +| Version | Date | Author | Changes | +|---------|------|--------|---------| +| 0.1 | 2026-03-14 | ruv | Initial proposal — architecture, storage model, ingestion pipeline | +| 0.2 | 2026-03-15 | RuVector Team | Added implementation phases, cost model, acceptance criteria | + +--- + +## Decision Statement + +**ADR-094 establishes π.ruv.io as a RuVector-native shared web memory platform that ingests large public corpora (starting with Common Crawl), compresses them into structured semantic memory objects, and exposes them as a queryable knowledge substrate for agents, models, and users.** + +This decision prioritizes: +- Retrieval-first architecture over model-training-first pipelines +- Temporal compression as a first-class storage primitive (ADR-017) +- Proof-gated mutation for all agent writes (ADR-047) +- Hybrid local/cloud deployment for cost control +- Coherence and contrast as query primitives via mincut boundaries + +**Acceptance Benchmark**: Ingest ≥1M Common Crawl pages, compress to ≤40% of raw embedding storage via temporal deduplication, achieve p95 semantic retrieval latency ≤50ms for 10M memory objects, with full provenance chains on every stored object. + +--- + +## 1. Context and Problem Statement + +### 1.1 Current State + +π.ruv.io currently operates as a shared brain for Claude Code sessions (ADR-059, ADR-060). The `mcp-brain-server` crate provides: + +| Component | Path | Current State | +|-----------|------|---------------| +| Memory store | `crates/mcp-brain-server/src/store.rs` | DashMap + Firestore, session-contributed memories | +| Knowledge graph | `crates/mcp-brain-server/src/graph.rs` | ruvector-mincut + ruvector-solver PPR search | +| Embeddings | `crates/mcp-brain-server/src/embeddings.rs` | HashEmbedder + RlmEmbedder, 128-dim | +| Cognitive engine | `crates/mcp-brain-server/src/cognitive.rs` | Hopfield + DentateGyrus + HDC | +| Midstream | `crates/mcp-brain-server/src/midstream.rs` | Lyapunov attractor + temporal solver + strange loop | +| RVF pipeline | `crates/mcp-brain-server/src/pipeline.rs` | VEC + META + WITNESS + DP segments | +| Temporal tracking | `types.rs` (DeltaStream) | VectorDelta per-memory, knowledge velocity | + +### 1.2 Problem + +The current system accepts only agent-contributed memories from Claude Code sessions. There is no mechanism to: + +1. **Ingest web-scale corpora** — Common Crawl alone is ~3.5 billion pages per crawl +2. **Compress and retain only semantic structure** — raw HTML wastes storage; boilerplate templates repeat across millions of pages +3. **Track temporal evolution** — the same URL may change content across crawls; deltas must collapse when novelty is low +4. **Build graph structure from hyperlinks** — link topology encodes authority, relevance, and domain relationships +5. **Detect contradictions** — conflicting claims across sources are coherence signals, not noise +6. **Scale writes safely** — agent swarms reading from and writing to shared memory need proof-gated mutation at scale + +Without this architecture, π.ruv.io risks becoming: +- An expensive web archive (storing raw data without compression) +- A conventional vector database (embeddings without graph or temporal structure) +- An opaque training pipeline (data in, model out, no shared retrieval) +- A static RAG backend (retrieval without shared learning or evolution tracking) + +### 1.3 Opportunity + +Treating the public web as a continuously evolving external memory layer where: +- Pages become structured **memory objects** (`WebMemory`) +- Semantic meaning becomes **vector state** (128-dim embeddings via ruvLLM) +- Links and references become **graph structure** (KnowledgeGraph edges with PPR) +- Change over time becomes **temporal deltas** (DeltaStream from ruvector-delta-core) +- Contradictions become **coherence signals** (mincut partition boundaries) +- Writes remain **proof-gated** (ADR-047 ProofGate) + +--- + +## 2. Decision + +We will implement π.ruv.io as a RuVector-native shared web memory platform with the following core decisions: + +### 2.1 RuVector is the System of Record for Semantic Memory + +All cleaned web content, embeddings, graph relationships, temporal deltas, and coherence metadata are stored in RuVector's memory plane. The `mcp-brain-server` store is extended with web-specific types (`WebMemory`, `WebPageDelta`, `LinkEdge`) that compose with the existing `BrainMemory` infrastructure. + +### 2.2 Retrieval-First, Not Model-Training-First + +Initial implementation focuses on shared retrieval, reasoning, clustering, provenance, and agent memory. The existing search, partition, and transfer APIs (ADR-059) are extended for web-scale queries. Fine-tuning or model training is optional and outside the critical path — the LoRA federation (ADR-060) can optionally train on web-derived preference signals. + +### 2.3 Temporal Compression is a First-Class Primitive + +Memory is stored as stable state plus deltas over time (extending `DeltaStream` from ADR-017). Repeated, templated, or low-novelty content collapses aggressively: + +| Content Signal | Compression Action | +|---|---| +| Boilerplate (nav, footer, cookie banners) | Strip before embedding; deduplicate via content hash | +| Template pages (product listings, directory entries) | Collapse to schema + per-instance delta | +| Low-novelty recrawls (< 5% content change) | Store as temporal delta, not full re-embedding | +| Near-duplicate pages (cosine > 0.98) | Merge into cluster centroid with provenance list | +| Contradictory content (mincut boundary crossing) | Preserve both sides with contradiction edge | + +### 2.4 Hybrid Local/Cloud Deployment + +| Layer | Hardware | Responsibility | +|---|---|---| +| Fetch + filter | Cloud (Cloud Run jobs) | WARC download, language detection, robots.txt compliance | +| Preprocess | Local (Mac Studio M2 Ultra / Mac mini M4) | HTML cleaning, deduplication, chunking, optional local embedding via ruvLLM | +| Ingest API | Cloud (Cloud Run service) | Validation, job dispatch, write orchestration, proof verification | +| Storage | Cloud (Firestore + GCS) | Persistent memory objects, RVF containers, temporal deltas | +| Query | Cloud (Cloud Run service) | Semantic search, graph traversal, coherence analysis | + +### 2.5 Proof-Gated Mutation Governs All Writes + +All agent-contributed memory updates must carry provenance, policy validation, and mutation proofs before becoming canonical (ADR-047). Web-ingested content uses a simplified proof path: + +``` +ProofRequirement::Composite(vec![ + ProofRequirement::TypeMatch { schema_id: WEB_MEMORY_SCHEMA }, + ProofRequirement::InvariantPreserved { invariant_id: PROVENANCE_CHAIN }, + ProofRequirement::CoherenceBound { min_coherence: 0.3 }, +]) +``` + +### 2.6 Coherence and Contrast are Query Primitives + +Dynamic mincut (ruvector-mincut), contradiction edges, graph partition boundaries, and novelty scoring are exposed as first-class indexing and reasoning features: + +- `GET /v1/web/contradictions?topic=X` — find conflicting claims across sources +- `GET /v1/web/novelty?since=2026-03-01` — detect newly emerging knowledge clusters +- `GET /v1/web/coherence?cluster_id=N` — measure internal consistency of a knowledge cluster +- `GET /v1/web/evolution?url=X` — temporal delta history for a specific source + +--- + +## 3. Decision Drivers + +| Driver | Requirement | Metric | +|---|---|---| +| Cost | Avoid full raw-cloud retention and excessive managed embedding spend | ≤$500/month for 10M memory objects | +| Auditability | Preserve source provenance and mutation history | 100% of objects have witness chains | +| Shared intelligence | Many agents contributing to and reading from one memory substrate | Support ≥100 concurrent agent sessions | +| Compression | Exploit RuVector temporal compression and structured retention | ≥60% storage reduction vs. raw embeddings | +| Performance | Semantic retrieval and graph traversal at production latency | p95 ≤50ms for search, p95 ≤100ms for graph traversal | +| Safety | Proof-gated write paths and scoped authority | Zero unprovenanced writes to canonical memory | +| Extensibility | Support future RVF packaging, agent swarms, and Cognitum edge nodes | Clean bounded contexts per DDD | + +--- + +## 4. High-Level Architecture + +### 4.1 Logical Topology + +```text +Common Crawl / Public Corpora + │ + ▼ +┌─────────────────────┐ +│ Fetch + Filter │ Cloud Run Jobs +│ WARC download │ Language detection +│ robots.txt check │ Size/quality gates +└─────────┬───────────┘ + │ + ▼ +┌─────────────────────┐ +│ Local Preprocess │ Mac Studio / Mac mini +│ HTML → text │ Boilerplate strip +│ Deduplication │ Content hashing +│ Chunking (512 tok) │ ruvLLM embedding (opt) +└─────────┬───────────┘ + │ + ▼ +┌─────────────────────┐ +│ Cloud Ingest API │ Cloud Run (π.ruv.io) +│ Validation │ Proof verification +│ Job dispatch │ Write orchestration +│ Rate limiting │ Byzantine aggregation +└─────────┬───────────┘ + │ + ▼ +┌──────────────────────────────────────────┐ +│ RuVector Shared Memory Plane │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │ Vectors │ │ Graph │ │ Temporal │ │ +│ │ 128-dim │ │ MinCut │ │ Deltas │ │ +│ │ HNSW idx │ │ PPR rank │ │ ADR-017 │ │ +│ └──────────┘ └──────────┘ └──────────┘ │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ +│ │Coherence │ │Provenance│ │ Midstream│ │ +│ │ scoring │ │ witness │ │ attractor│ │ +│ │ mincut │ │ chains │ │ solver │ │ +│ └──────────┘ └──────────┘ └──────────┘ │ +└──────────┬───────────────────┬──────────┘ + │ │ + ▼ ▼ +┌─────────────────┐ ┌─────────────────┐ +│ Agent Query API │ │ Admin/Analytics │ +│ semantic search │ │ health, compact │ +│ graph traverse │ │ drift, audit │ +│ contrast query │ │ cost tracking │ +└─────────────────┘ └─────────────────┘ +``` + +### 4.2 Data Model + +#### WebMemory (extends BrainMemory) + +```rust +/// A web-sourced memory object in the shared memory plane +pub struct WebMemory { + /// Core brain memory fields (embedding, quality, witness, etc.) + pub base: BrainMemory, + /// Source URL (canonical, after redirect resolution) + pub source_url: String, + /// Domain extracted from source_url + pub domain: String, + /// Content hash (SHAKE-256) for deduplication + pub content_hash: String, + /// Crawl timestamp (when the content was fetched) + pub crawl_timestamp: DateTime, + /// Crawl source identifier (e.g., "cc-2026-09") + pub crawl_source: String, + /// Language code (ISO 639-1) + pub language: String, + /// Outbound link URLs (for graph construction) + pub outbound_links: Vec, + /// Temporal compression tier + pub compression_tier: CompressionTier, + /// Novelty score relative to existing memory (0.0 = duplicate, 1.0 = entirely new) + pub novelty_score: f32, +} + +/// Temporal compression tiers (ADR-017 alignment) +pub enum CompressionTier { + /// Full embedding + content stored (high novelty, first seen) + Full, + /// Embedding stored, content as delta from nearest neighbor + DeltaCompressed, + /// Only centroid contribution stored (near-duplicate) + CentroidMerged, + /// Archived — retrievable from GCS but not in hot memory + Archived, +} +``` + +#### WebPageDelta (temporal evolution) + +```rust +/// Tracks how a web page changes across crawls +pub struct WebPageDelta { + pub page_url: String, + pub previous_memory_id: Uuid, + pub current_memory_id: Uuid, + /// Cosine similarity between previous and current embeddings + pub embedding_drift: f32, + /// Content diff summary (not raw diff — structured delta) + pub content_delta: ContentDelta, + /// Crawl interval + pub time_delta: Duration, + /// Whether this delta crossed a mincut boundary + pub boundary_crossing: bool, +} + +pub enum ContentDelta { + /// Content unchanged (hash match) + Unchanged, + /// Minor update (< 5% token change) + Minor { changed_tokens: usize, total_tokens: usize }, + /// Major revision (≥ 5% token change) + Major { summary: String, changed_tokens: usize }, + /// Complete rewrite (cosine < 0.7) + Rewrite, +} +``` + +#### LinkEdge (graph construction) + +```rust +/// A directed edge from source page to target page +pub struct LinkEdge { + pub source_memory_id: Uuid, + pub target_memory_id: Uuid, + /// Anchor text embedding (if meaningful anchor text exists) + pub anchor_embedding: Option>, + /// Link context: surrounding paragraph embedding + pub context_embedding: Vec, + /// Link type classification + pub link_type: LinkType, + /// Weight based on semantic relevance of anchor + context + pub weight: f64, +} + +pub enum LinkType { + /// Informational reference + Citation, + /// Navigational (same-site) + Navigation, + /// Supporting evidence + Evidence, + /// Contradiction or rebuttal + Contradiction, + /// Unknown / unclassified + Unknown, +} +``` + +### 4.3 Ingestion Pipeline + +```text +Phase 1: Fetch + WARC segment → HTTP response → (url, html, headers, timestamp) + Gates: robots.txt compliance, language filter, size limit (1MB) + +Phase 2: Clean + HTML → readable text (via readability/trafilatura equivalent) + Strip: nav, footer, ads, cookie banners, scripts + Extract: title, main content, outbound links, meta description + Output: CleanedPage { url, text, links, title, meta } + +Phase 3: Deduplicate + Content hash (SHAKE-256 of normalized text) + Check against content_hash index in store + If exact match → skip (or record temporal "unchanged" delta) + If near-match (simhash within 3 bits) → flag for delta compression + +Phase 4: Chunk + Embed + Split text into 512-token chunks (with 64-token overlap) + Generate 128-dim embeddings via ruvLLM HashEmbedder + Optional: local ruvLLM RlmEmbedder for higher quality + +Phase 5: Novelty Score + Compare each chunk embedding against nearest neighbors in HNSW index + Novelty = 1.0 - max_cosine_similarity(chunk, existing_memories) + If novelty < 0.05 → CentroidMerged tier + If novelty < 0.20 → DeltaCompressed tier + Else → Full tier + +Phase 6: Graph Construction + For each outbound link in CleanedPage: + Resolve target URL → target memory_id (if exists) + Classify link type from anchor text + context + Create LinkEdge with semantic weight + +Phase 7: Proof + Store + Construct ProofRequirement (Section 2.5) + Build RVF container (pipeline.rs extension) + Store WebMemory + LinkEdges + WebPageDeltas + Update KnowledgeGraph (graph.rs) + Record witness chain +``` + +### 4.4 Midstream Integration + +The existing midstream crate (`crates/mcp-brain-server/src/midstream.rs`) provides three capabilities critical to web memory: + +| Midstream Component | Web Memory Application | +|---|---| +| `temporal-attractor-studio` (Lyapunov) | Detect which knowledge domains are stable vs. chaotic. Stable domains (negative λ) compress more aggressively. Chaotic domains (positive λ) retain more temporal deltas. | +| `temporal-neural-solver` (Certified prediction) | Predict future content drift for scheduling recrawl priority. High-confidence stability predictions → lower crawl frequency. | +| `strange-loop` (Meta-cognition) | Evaluate query relevance × memory quality for web search results. The 5ms budget per query adds meta-cognitive scoring without latency impact. | +| `nanosecond-scheduler` | Schedule background compaction, recrawl triggers, and temporal delta aggregation with nanosecond precision. | + +### 4.5 New API Endpoints + +| Method | Path | Description | +|---|---|---| +| `POST` | `/v1/web/ingest` | Submit a batch of cleaned pages for ingestion | +| `GET` | `/v1/web/search` | Semantic search across web memory (extends `/v1/search`) | +| `GET` | `/v1/web/contradictions` | Find conflicting claims across sources | +| `GET` | `/v1/web/novelty` | Detect newly emerging knowledge clusters | +| `GET` | `/v1/web/coherence` | Measure internal consistency of a cluster | +| `GET` | `/v1/web/evolution` | Temporal delta history for a URL or topic | +| `GET` | `/v1/web/graph` | Subgraph extraction around a memory or topic | +| `POST` | `/v1/web/recrawl` | Request recrawl of specific URLs | +| `GET` | `/v1/web/status` | Web memory statistics and pipeline health | +| `GET` | `/v1/web/domains` | Domain-level statistics and authority scores | + +--- + +## 5. Compression and Storage Model + +### 5.1 Storage Budget + +| Item | Per Object | At 10M Objects | At 100M Objects | +|---|---|---|---| +| Embedding (128 × f32) | 512 B | 4.8 GB | 48 GB | +| Metadata (JSON compressed) | ~200 B | 1.9 GB | 19 GB | +| Graph edges (avg 5/page) | ~120 B | 1.1 GB | 11 GB | +| Temporal deltas (avg 2/page) | ~80 B | 0.7 GB | 7.5 GB | +| Witness chain | ~82 B | 0.8 GB | 7.8 GB | +| **Total (hot)** | **~994 B** | **9.3 GB** | **93 GB** | + +With temporal compression at 60% reduction: **3.7 GB for 10M objects** in hot memory. + +### 5.2 Tiered Storage + +| Tier | Location | Latency | Retention | +|---|---|---|---| +| Hot | DashMap (in-memory) | <1ms | Active + high-quality memories | +| Warm | Firestore | ~10ms | All canonical memories with recent access | +| Cold | GCS (RVF containers) | ~100ms | Full archive, low-access memories | +| Frozen | GCS Archive class | ~1s | Historical snapshots, compacted deltas | + +### 5.3 Compaction Policy + +Background compaction runs on the nanosecond-scheduler: + +1. **Centroid merge**: Near-duplicates (cosine > 0.98) merge into cluster centroids every 6 hours +2. **Delta collapse**: Sequential minor deltas (< 5% change each) collapse into a single major delta daily +3. **Tier promotion/demotion**: Access-frequency-based movement between hot/warm/cold every hour +4. **Archive sweep**: Memories with quality_score < 0.2 after 30 days move to frozen tier + +--- + +## 6. Emergent Capabilities + +The combination of web-scale ingestion, RuVector's graph + temporal + coherence primitives, and the shared memory plane enables capabilities that go beyond conventional search or RAG: + +### 6.1 Collective Intelligence for Agent Swarms + +Agents contribute patterns, solutions, and observations into the shared substrate. Web memory provides the foundational knowledge layer that agent contributions build upon. Instead of every agent rediscovering known solutions, the web memory serves as a global baseline. + +### 6.2 Autonomous Research Engine + +Continuous research agents can: +1. Semantic retrieval across the web corpus +2. Pattern extraction via Hopfield recall +3. Contradiction detection via mincut boundaries +4. New hypothesis storage back into the graph + +### 6.3 Knowledge Cartography + +RuVector's combination of vectors + graphs + mincut boundaries + coherence scoring enables mapping knowledge topology: emerging research fields, scientific disagreements, technological trends, cross-disciplinary idea clusters. + +### 6.4 Truth Verification Infrastructure + +Claims stored with supporting sources, semantic similarity, contradiction edges, and confidence scoring enable evidence-backed reasoning. Agents retrieve not only answers but evidence graphs — aligned with proof-gated mutation (ADR-047). + +### 6.5 Knowledge Evolution Tracking + +Temporal delta tracking observes how knowledge evolves: consensus formation, misinformation propagation, idea propagation through research communities. The attractor analysis (midstream) identifies stable vs. chaotic knowledge domains. + +### 6.6 Structural Search + +Traditional search: "what pages mention this?" +π.ruv.io search: "what does humanity know about this problem?" + +The shift from document retrieval to knowledge reasoning is enabled by graph traversal + coherence analysis + temporal context. + +--- + +## 7. Cost Model + +### 7.1 Preprocessing (Local) + +| Resource | Specification | Cost | +|---|---|---| +| Mac Studio M2 Ultra | 192 GB unified memory, 24-core CPU | One-time ($3,999) | +| Storage (local NVMe) | 2 TB for WARC staging | Included | +| Power | ~50W average | ~$5/month | +| **Preprocessing throughput** | **~10K pages/hour (embed + dedupe)** | | + +### 7.2 Cloud (Google Cloud) + +| Service | Usage | Monthly Cost | +|---|---|---| +| Cloud Run (ingest API) | 2 vCPU, 4 GB, always-on | ~$65 | +| Cloud Run (query API) | Same as existing brain server | ~$65 | +| Firestore | 10M documents, 100K reads/day | ~$50 | +| GCS (RVF containers) | 50 GB Standard, 500 GB Archive | ~$15 | +| Cloud Tasks (job queue) | 1M tasks/month | ~$5 | +| **Total monthly** | | **~$200** | + +### 7.3 Scaling Projection + +| Scale | Hot Memory | Monthly Cloud | Preprocessing Time | +|---|---|---|---| +| 1M pages | ~370 MB | ~$100 | ~4 days (Mac Studio) | +| 10M pages | ~3.7 GB | ~$200 | ~6 weeks | +| 100M pages | ~37 GB | ~$500 | ~14 months | +| 1B pages | ~370 GB (tiered) | ~$2,000 | Distributed cluster | + +--- + +## 8. Implementation Phases + +### Phase 1: Data Model + Types (Week 1-2) + +**Files**: `crates/mcp-brain-server/src/web_memory.rs` (new) + +- Define `WebMemory`, `WebPageDelta`, `LinkEdge`, `CompressionTier`, `ContentDelta`, `LinkType` +- Extend `FirestoreClient` with web memory CRUD operations +- Add web-specific categories to `BrainCategory::Custom` +- Integration tests for type serialization round-trips + +### Phase 2: Ingestion Pipeline (Week 3-5) + +**Files**: `crates/mcp-brain-server/src/web_ingest.rs` (new) + +- HTML cleaning (strip boilerplate, extract content) +- Content hashing (SHAKE-256 deduplication) +- Chunking (512 tokens, 64-token overlap) +- Novelty scoring against existing HNSW index +- Compression tier assignment +- Batch ingestion endpoint (`POST /v1/web/ingest`) + +### Phase 3: Graph Construction (Week 5-7) + +**Files**: extend `crates/mcp-brain-server/src/graph.rs` + +- Link extraction and resolution +- LinkEdge creation with semantic weighting +- Contradiction detection via mincut boundary analysis +- PPR-ranked graph traversal for web subgraphs + +### Phase 4: Temporal Compression (Week 7-9) + +**Files**: extend `crates/mcp-brain-server/src/drift.rs`, new `web_temporal.rs` + +- WebPageDelta tracking across recrawls +- Compaction policy implementation on nanosecond-scheduler +- Centroid merge for near-duplicates +- Delta collapse for sequential minor changes +- Tiered storage promotion/demotion + +### Phase 5: Query APIs (Week 9-11) + +**Files**: extend `crates/mcp-brain-server/src/routes.rs` + +- `/v1/web/search` — semantic + graph-ranked web memory search +- `/v1/web/contradictions` — mincut boundary queries +- `/v1/web/novelty` — emerging cluster detection +- `/v1/web/evolution` — temporal delta history +- `/v1/web/coherence` — cluster consistency measurement +- Midstream scoring integration (attractor + solver + strange loop) + +### Phase 6: Local Preprocessing CLI (Week 11-13) + +**Files**: `crates/mcp-brain-server/src/preprocess.rs` (new), extend `npm/packages/ruvector/bin/cli.js` + +- WARC reader for Common Crawl segments +- Local HTML cleaning pipeline +- Batch embedding via ruvLLM +- Upload to Cloud Ingest API +- CLI commands: `npx ruvector web ingest`, `npx ruvector web status` + +### Phase 7: Hardening + Acceptance (Week 13-14) + +- Load testing at 10M objects +- p95 latency validation (≤50ms search, ≤100ms graph) +- Compression ratio validation (≥60%) +- Proof chain integrity audit +- Cost model validation against projections + +--- + +## 9. Invariants + +| ID | Invariant | Enforcement | +|---|---|---| +| INV-1 | Every WebMemory has a non-empty provenance chain | `build_rvf_container` requires witness_chain | +| INV-2 | Content hash is unique per Full-tier memory | SHAKE-256 dedup check before store | +| INV-3 | Agent writes are proof-gated | ProofGate wraps mutation path | +| INV-4 | Temporal deltas reference valid parent memories | Foreign key check on previous_memory_id | +| INV-5 | Compression tier assignment matches novelty score | Tier = f(novelty_score) is deterministic | +| INV-6 | LinkEdge weights are in [0.0, 1.0] | Clamped at construction | +| INV-7 | Hot memory fits within configured budget | Compaction triggers when hot tier exceeds threshold | +| INV-8 | Contradiction edges are symmetric | If A contradicts B, B contradicts A | + +--- + +## 10. Security Considerations + +### 10.1 Inherited from ADR-059 + +All seven security layers from the Shared Brain apply: +1. Input sanitization (PII strip) +2. Differential privacy (ε=1.0, δ=1e-5) +3. Ed25519 signatures +4. Witness chains (SHAKE-256) +5. Byzantine-tolerant aggregation +6. Rate limiting (BudgetTokenBucket) +7. Reputation-gated writes + +### 10.2 Web-Specific Threats + +| Threat | Mitigation | +|---|---| +| SEO spam injection | Quality gating: novelty < 0.05 + low authority domain → reject | +| Link manipulation | PageRank-style authority scoring via ruvector-solver PPR | +| Content poisoning | Contradiction detection flags conflicting claims for review | +| Copyright claims | robots.txt compliance; only store embeddings + structured metadata, not raw content | +| Crawl budget abuse | Rate limiting on ingest API; batch size caps | + +--- + +## 11. Alternatives Considered + +### 11.1 Use Common Crawl as LLM Training Data + +**Rejected**: Training produces a static model that cannot be queried structurally, lacks provenance, and requires expensive retraining. The retrieval-first approach preserves auditability and supports continuous evolution. + +### 11.2 Use a Managed Vector Database (Pinecone, Weaviate) + +**Rejected**: External managed services add latency, cost, and vendor lock-in. They lack graph structure, temporal compression, mincut coherence, and proof-gated mutation. RuVector provides all of these natively. + +### 11.3 Store Raw HTML in Object Storage + +**Rejected**: Raw HTML storage is expensive at scale and provides no semantic structure. The cleaning + embedding + graph construction pipeline is essential for the platform to be useful as a knowledge substrate rather than a web archive. + +### 11.4 Build a Separate Ingestion Service + +**Rejected**: The ingestion pipeline shares types, embedding infrastructure, graph construction, and proof verification with the existing brain server. A separate service would duplicate these dependencies. Instead, web memory is implemented as an extension module within `mcp-brain-server`. + +--- + +## 12. Future Directions + +1. **Developer Platform API**: Expose π.ruv.io as a shared memory service for any agent framework, not just Claude Code sessions +2. **Automatic Contradiction Detection**: Use dynamic mincut + coherence scoring to detect contradictions and emerging ideas automatically +3. **Cognitum Edge Nodes**: Deploy compressed web memory subsets to edge devices (ADR-040) for offline agent reasoning +4. **RVF Knowledge Export**: Package web memory clusters as RVF cognitive containers (ADR-056) for transfer between deployments +5. **Cross-Domain Transfer Learning**: Use web memory as a foundation for domain expansion (ADR-068) across specialized agent populations + +--- + +## 13. Acceptance Criteria + +| Criterion | Target | Measurement | +|---|---|---| +| Ingestion throughput | ≥1K pages/second (cloud batch) | Load test with 100K page batch | +| Storage compression | ≥60% reduction vs. raw embeddings | Compare raw vs. compressed storage at 1M objects | +| Search latency (p95) | ≤50ms for semantic search | Benchmark with 10M objects, 100 concurrent queries | +| Graph traversal (p95) | ≤100ms for 2-hop subgraph | Benchmark with 10M nodes, 50M edges | +| Provenance coverage | 100% of objects have witness chains | Audit query: count objects without witness_chain | +| Contradiction detection | ≥80% precision on labeled test set | Manual evaluation of 500 flagged contradictions | +| Cost | ≤$500/month at 10M objects | Monthly billing review | +| Proof-gate coverage | Zero unprovenanced writes | Audit log analysis | + +--- + +## 14. References + +- ADR-017: Temporal Tensor Compression with Tiered Quantization +- ADR-030: RVF Computational Container +- ADR-040: Cognitum Swarm +- ADR-047: Proof-Gated Mutation Protocol +- ADR-058: MCP Tool Groups +- ADR-059: Shared Brain — Google Cloud Deployment +- ADR-060: Shared Brain Capabilities — Federated MicroLoRA Intelligence Substrate +- ADR-077: Midstream Platform Integration +- ADR-091: INT8 CNN Quantization +- Common Crawl: https://commoncrawl.org/ +- RuVector compression and temporal tiering architecture +- Contrastive AI framing and proof-gated mutation principles