diff --git a/examples/rvf/Cargo.toml b/examples/rvf/Cargo.toml index 5ef61061..618859b0 100644 --- a/examples/rvf/Cargo.toml +++ b/examples/rvf/Cargo.toml @@ -269,6 +269,18 @@ path = "examples/genomic_graphcut.rs" name = "supply_chain_graphcut" path = "examples/supply_chain_graphcut.rs" +[[example]] +name = "financial_fraud_graphcut" +path = "examples/financial_fraud_graphcut.rs" + [[example]] name = "openfang" path = "examples/openfang.rs" + +[[example]] +name = "cyber_threat_graphcut" +path = "examples/cyber_threat_graphcut.rs" + +[[example]] +name = "climate_graphcut" +path = "examples/climate_graphcut.rs" diff --git a/examples/rvf/examples/climate_graphcut.rs b/examples/rvf/examples/climate_graphcut.rs new file mode 100644 index 00000000..5532c8c1 --- /dev/null +++ b/examples/rvf/examples/climate_graphcut.rs @@ -0,0 +1,574 @@ +//! Climate Anomaly Detection via Graph Cut / MRF + RuVector +//! +//! Applies MRF/mincut optimization to environmental monitoring data: +//! 1. Generate synthetic station grid (30x40 = 1200 monitoring stations) +//! 2. Inject anomalies: heat waves, pollution spikes, drought, ocean warming, +//! cold snaps, sensor faults +//! 3. Extract 32-dim embeddings, build spatial+similarity graph, solve mincut +//! 4. Store station embeddings in RVF with climate zone metadata +//! 5. Evaluate: precision, recall, F1, per-event detection rates +//! +//! Run: cargo run --example climate_graphcut --release + +use rvf_runtime::{FilterExpr, MetadataEntry, MetadataValue, QueryOptions, RvfOptions, RvfStore}; +use rvf_runtime::filter::FilterValue; +use rvf_runtime::options::DistanceMetric; +use rvf_types::DerivationType; +use rvf_crypto::{create_witness_chain, verify_witness_chain, shake256_256, WitnessEntry}; +use tempfile::TempDir; + +const ROWS: usize = 30; +const COLS: usize = 40; +const N_STATIONS: usize = ROWS * COLS; +const DIM: usize = 32; + +const FIELD_REGION: u16 = 0; +const FIELD_ELEV_CLASS: u16 = 1; +const FIELD_CLIMATE_ZONE: u16 = 2; +const FIELD_ANOMALY_TYPE: u16 = 3; + +fn lcg_next(s: &mut u64) -> u64 { + *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); *s +} +fn lcg_f64(s: &mut u64) -> f64 { lcg_next(s); (*s >> 11) as f64 / ((1u64 << 53) as f64) } +fn lcg_normal(s: &mut u64) -> f64 { + let u1 = lcg_f64(s).max(1e-15); let u2 = lcg_f64(s); + (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +#[derive(Debug, Clone, Copy, PartialEq)] +enum AnomalyType { Normal, HeatWave, PollutionSpike, Drought, OceanWarming, ColdSnap, SensorFault } +impl AnomalyType { + fn label(&self) -> &'static str { + match self { + Self::Normal => "normal", Self::HeatWave => "heat_wave", + Self::PollutionSpike => "pollution", Self::Drought => "drought", + Self::OceanWarming => "ocean_warm", Self::ColdSnap => "cold_snap", + Self::SensorFault => "sensor_fault", + } + } +} + +#[derive(Debug, Clone)] +struct Station { + row: usize, col: usize, + lat: f64, lon: f64, elevation: f64, + temperature: f64, humidity: f64, precipitation: f64, + wind_speed: f64, aqi: f64, co2: f64, + sst: f64, ndvi: f64, day_of_year: f64, + is_coastal: bool, truth: AnomalyType, +} + +fn climate_zone(lat: f64) -> &'static str { + let a = lat.abs(); + if a > 60.0 { "polar" } else if a > 40.0 { "temperate" } else if a > 23.5 { "subtropical" } + else { "tropical" } +} + +fn elev_class(e: f64) -> &'static str { + if e < 200.0 { "lowland" } else if e < 1000.0 { "highland" } else { "mountain" } +} + +fn region_label(row: usize, col: usize) -> &'static str { + match (row < ROWS / 2, col < COLS / 2) { + (true, true) => "NW", (true, false) => "NE", + (false, true) => "SW", _ => "SE", + } +} + +fn generate_stations(seed: u64, day: f64) -> Vec { + let mut rng = seed; + let mut stations = Vec::with_capacity(N_STATIONS); + let season = (day / 365.0 * 2.0 * std::f64::consts::PI).sin(); + let season_cos = (day / 365.0 * 2.0 * std::f64::consts::PI).cos(); + + // Pre-generate anomaly clusters (spatial events) + let n_heat = 2; let n_poll = 2; let n_drought = 1; let n_cold = 1; + let mut heat_centers = Vec::new(); + for _ in 0..n_heat { + heat_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64, + 3.0 + lcg_f64(&mut rng) * 4.0)); + } + let mut poll_centers = Vec::new(); + for _ in 0..n_poll { + poll_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64, + 2.0 + lcg_f64(&mut rng) * 3.0)); + } + let mut drought_centers = Vec::new(); + for _ in 0..n_drought { + drought_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64, + 4.0 + lcg_f64(&mut rng) * 5.0)); + } + let mut cold_centers = Vec::new(); + for _ in 0..n_cold { + cold_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64, + 3.0 + lcg_f64(&mut rng) * 3.0)); + } + + for r in 0..ROWS { + for c in 0..COLS { + let lat = 25.0 + (r as f64 / ROWS as f64) * 50.0; // 25N to 75N + let lon = -125.0 + (c as f64 / COLS as f64) * 60.0; // 125W to 65W + let elevation = (200.0 + lcg_normal(&mut rng) * 300.0 + + 800.0 * ((r as f64 * 0.2).sin() * (c as f64 * 0.15).cos()).abs()).max(0.0); + let is_coastal = c < 3 || c >= COLS - 3 || r < 2 || r >= ROWS - 2; + + // Seasonal baseline temperature: warmer at lower latitudes, summer peak + let t_base = 30.0 - 0.6 * (lat - 25.0) + 12.0 * season - elevation * 0.006; + let temperature = t_base + lcg_normal(&mut rng) * 3.0; + let humidity = (60.0 + 15.0 * season_cos + lcg_normal(&mut rng) * 10.0).clamp(10.0, 100.0); + let precipitation = (20.0 + 30.0 * (0.5 + 0.5 * season_cos) + + lcg_normal(&mut rng) * 15.0).max(0.0); + let wind_speed = (5.0 + lcg_normal(&mut rng) * 3.0).max(0.5); + let aqi = (35.0 + lcg_normal(&mut rng) * 15.0).clamp(0.0, 500.0); + let co2 = 420.0 + lcg_normal(&mut rng) * 5.0; + let sst = if is_coastal { + 15.0 + 5.0 * season - 0.2 * (lat - 40.0) + lcg_normal(&mut rng) * 1.5 + } else { 0.0 }; + let ndvi = (0.5 + 0.15 * season + lcg_normal(&mut rng) * 0.1 + - elevation * 0.0001).clamp(0.0, 1.0); + + let mut truth = AnomalyType::Normal; + + // Check spatial anomaly clusters + for &(cr, cc, rad) in &heat_centers { + let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt(); + if d <= rad { truth = AnomalyType::HeatWave; } + } + for &(cr, cc, rad) in &poll_centers { + let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt(); + if d <= rad { truth = AnomalyType::PollutionSpike; } + } + for &(cr, cc, rad) in &drought_centers { + let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt(); + if d <= rad { truth = AnomalyType::Drought; } + } + for &(cr, cc, rad) in &cold_centers { + let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt(); + if d <= rad { truth = AnomalyType::ColdSnap; } + } + // Ocean warming: coastal belt anomaly + if truth == AnomalyType::Normal && is_coastal && lat < 45.0 && lcg_f64(&mut rng) < 0.3 { + truth = AnomalyType::OceanWarming; + } + // Sensor fault: rare random + if truth == AnomalyType::Normal && lcg_f64(&mut rng) < 0.005 { + truth = AnomalyType::SensorFault; + } + + // Apply anomaly effects + let (temperature, humidity, precipitation, aqi, co2, sst, ndvi) = match truth { + AnomalyType::HeatWave => ( + temperature + 8.0 + lcg_normal(&mut rng) * 2.0, + (humidity - 20.0).max(10.0), (precipitation * 0.2).max(0.0), + aqi + 30.0, co2 + 10.0, sst, (ndvi - 0.15).max(0.0)), + AnomalyType::PollutionSpike => ( + temperature, humidity,precipitation, + (180.0 + lcg_f64(&mut rng) * 200.0).min(500.0), + co2 + 25.0 + lcg_normal(&mut rng) * 10.0, sst, ndvi), + AnomalyType::Drought => ( + temperature + 4.0, (humidity - 30.0).max(10.0), + (precipitation * 0.05).max(0.0), aqi + 15.0, co2, + sst, (ndvi - 0.25).max(0.0)), + AnomalyType::OceanWarming => ( + temperature + 2.0, humidity, precipitation, aqi, co2, + sst + 3.0 + lcg_normal(&mut rng) * 0.5, ndvi), + AnomalyType::ColdSnap => ( + temperature - 15.0 - lcg_f64(&mut rng) * 5.0, + humidity + 10.0, precipitation + 5.0, + aqi, co2, sst, (ndvi - 0.1).max(0.0)), + AnomalyType::SensorFault => ( + -80.0 + lcg_f64(&mut rng) * 160.0, + lcg_f64(&mut rng) * 200.0, + lcg_f64(&mut rng) * 500.0, + lcg_f64(&mut rng) * 500.0, + lcg_f64(&mut rng) * 1000.0, + if is_coastal { lcg_f64(&mut rng) * 50.0 } else { 0.0 }, + lcg_f64(&mut rng)), + AnomalyType::Normal => (temperature, humidity, precipitation, aqi, co2, sst, ndvi), + }; + + stations.push(Station { + row: r, col: c, lat, lon, elevation, temperature, humidity, + precipitation, wind_speed, aqi, co2, sst, ndvi, day_of_year: day, + is_coastal, truth, + }); + } + } + stations +} + +fn extract_embedding(st: &Station, stats: &StationStats) -> Vec { + let t_anom = (st.temperature - stats.t_mean) / stats.t_std.max(1e-6); + let h_z = (st.humidity - stats.h_mean) / stats.h_std.max(1e-6); + let p_z = (st.precipitation - stats.p_mean) / stats.p_std.max(1e-6); + let w_z = (st.wind_speed - stats.w_mean) / stats.w_std.max(1e-6); + let aqi_log = (st.aqi + 1.0).ln(); + let co2_dev = (st.co2 - 420.0) / 20.0; + let sst_anom = if st.is_coastal { (st.sst - stats.sst_mean) / stats.sst_std.max(1e-6) } else { 0.0 }; + let ndvi_dev = (st.ndvi - stats.ndvi_mean) / stats.ndvi_std.max(1e-6); + let lat_sin = (st.lat * std::f64::consts::PI / 180.0).sin(); + let lat_cos = (st.lat * std::f64::consts::PI / 180.0).cos(); + let lon_sin = (st.lon * std::f64::consts::PI / 180.0).sin(); + let lon_cos = (st.lon * std::f64::consts::PI / 180.0).cos(); + let elev_log = (st.elevation + 1.0).ln() / 8.0; + let season_sin = (st.day_of_year / 365.0 * 2.0 * std::f64::consts::PI).sin(); + let season_cos = (st.day_of_year / 365.0 * 2.0 * std::f64::consts::PI).cos(); + + vec![ + t_anom as f32, h_z as f32, p_z as f32, w_z as f32, // 0-3: z-scores + aqi_log as f32, co2_dev as f32, sst_anom as f32, ndvi_dev as f32, // 4-7: derived + lat_sin as f32, lat_cos as f32, lon_sin as f32, lon_cos as f32, // 8-11: position + elev_log as f32, season_sin as f32, season_cos as f32, // 12-14: context + (t_anom.abs() + h_z.abs()) as f32, // 15: temp+humidity + (t_anom * p_z) as f32, // 16: temp*precip + (aqi_log * co2_dev) as f32, // 17: aqi*co2 + (t_anom * ndvi_dev) as f32, // 18: temp*ndvi + if st.aqi > 150.0 { 1.0 } else { 0.0 }, // 19: pollution flag + if t_anom > 2.0 { 1.0 } else if t_anom < -2.0 { -1.0 } else { 0.0 }, // 20: temp flag + (t_anom * t_anom) as f32, // 21: t^2 + (h_z * h_z) as f32, // 22: h^2 + (p_z * p_z) as f32, // 23: p^2 + (sst_anom * sst_anom) as f32, // 24: sst^2 + (t_anom.abs() * aqi_log) as f32, // 25: |t|*aqi + (p_z * ndvi_dev) as f32, // 26: precip*ndvi + (co2_dev * t_anom) as f32, // 27: co2*t + if st.is_coastal { 1.0 } else { 0.0 }, // 28: coastal flag + (st.temperature / 50.0).clamp(-1.0, 1.0) as f32, // 29: raw temp norm + (st.aqi / 500.0) as f32, // 30: raw aqi norm + (st.ndvi) as f32, // 31: raw ndvi + ] +} + +struct StationStats { + t_mean: f64, t_std: f64, h_mean: f64, h_std: f64, + p_mean: f64, p_std: f64, w_mean: f64, w_std: f64, + sst_mean: f64, sst_std: f64, ndvi_mean: f64, ndvi_std: f64, +} + +fn compute_stats(stations: &[Station]) -> StationStats { + let n = stations.len() as f64; + let mean = |f: &dyn Fn(&Station) -> f64| stations.iter().map(f).sum::() / n; + let std = |f: &dyn Fn(&Station) -> f64, m: f64| + (stations.iter().map(|s| (f(s) - m).powi(2)).sum::() / n).sqrt(); + let tm = mean(&|s| s.temperature); let hm = mean(&|s| s.humidity); + let pm = mean(&|s| s.precipitation); let wm = mean(&|s| s.wind_speed); + let coastal: Vec<&Station> = stations.iter().filter(|s| s.is_coastal).collect(); + let cn = coastal.len().max(1) as f64; + let sm = coastal.iter().map(|s| s.sst).sum::() / cn; + let ss = (coastal.iter().map(|s| (s.sst - sm).powi(2)).sum::() / cn).sqrt(); + let nm = mean(&|s| s.ndvi); + StationStats { + t_mean: tm, t_std: std(&|s| s.temperature, tm), + h_mean: hm, h_std: std(&|s| s.humidity, hm), + p_mean: pm, p_std: std(&|s| s.precipitation, pm), + w_mean: wm, w_std: std(&|s| s.wind_speed, wm), + sst_mean: sm, sst_std: ss, + ndvi_mean: nm, ndvi_std: std(&|s| s.ndvi, nm), + } +} + +fn unary_score(st: &Station, stats: &StationStats) -> f64 { + let t_z = ((st.temperature - stats.t_mean) / stats.t_std.max(1e-6)).abs(); + let h_z = ((st.humidity - stats.h_mean) / stats.h_std.max(1e-6)).abs(); + let p_z = ((st.precipitation - stats.p_mean) / stats.p_std.max(1e-6)).abs(); + let aqi_f = if st.aqi > 100.0 { (st.aqi - 100.0) / 100.0 } else { 0.0 }; + let co2_f = ((st.co2 - 420.0).abs() / 20.0).max(0.0); + let sst_z = if st.is_coastal { + ((st.sst - stats.sst_mean) / stats.sst_std.max(1e-6)).abs() + } else { 0.0 }; + let ndvi_z = ((st.ndvi - stats.ndvi_mean) / stats.ndvi_std.max(1e-6)).abs(); + 0.3 * t_z + 0.15 * h_z + 0.1 * p_z + 0.2 * aqi_f + 0.1 * co2_f + + 0.15 * sst_z + 0.1 * ndvi_z - 1.2 +} + +fn cosine_sim(a: &[f32], b: &[f32]) -> f64 { + let (mut d, mut na, mut nb) = (0.0f64, 0.0f64, 0.0f64); + for i in 0..a.len().min(b.len()) { + d += a[i] as f64 * b[i] as f64; + na += (a[i] as f64).powi(2); nb += (b[i] as f64).powi(2); + } + let dn = na.sqrt() * nb.sqrt(); + if dn < 1e-15 { 0.0 } else { d / dn } +} + +struct Edge { from: usize, to: usize, weight: f64 } + +fn build_graph(stations: &[Station], embs: &[Vec], alpha: f64, beta: f64, k: usize) -> Vec { + let mut edges = Vec::new(); + // 4-connected spatial adjacency + for r in 0..ROWS { for c in 0..COLS { + let idx = r * COLS + c; + for &(dr, dc) in &[(0i32, 1i32), (1, 0)] { + let (nr, nc) = (r as i32 + dr, c as i32 + dc); + if nr >= ROWS as i32 || nc >= COLS as i32 { continue; } + let nidx = nr as usize * COLS + nc as usize; + let dist = ((stations[idx].lat - stations[nidx].lat).powi(2) + + (stations[idx].lon - stations[nidx].lon).powi(2)).sqrt(); + let grad = (stations[idx].temperature - stations[nidx].temperature).abs() + + (stations[idx].aqi - stations[nidx].aqi).abs() * 0.01; + let w = alpha * (1.0 / (1.0 + dist)) * (-grad * 0.05).exp(); + edges.push(Edge { from: idx, to: nidx, weight: w }); + edges.push(Edge { from: nidx, to: idx, weight: w }); + } + }} + // kNN similarity edges from embeddings + for i in 0..embs.len() { + let mut sims: Vec<(usize, f64)> = (0..embs.len()) + .filter(|&j| { + let (ri, ci) = (i / COLS, i % COLS); + let (rj, cj) = (j / COLS, j % COLS); + (ri as isize - rj as isize).unsigned_abs() + (ci as isize - cj as isize).unsigned_abs() > 2 + }) + .map(|j| (j, cosine_sim(&embs[i], &embs[j]).max(0.0))).collect(); + sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + for &(j, s) in sims.iter().take(k) { + if s > 0.1 { edges.push(Edge { from: i, to: j, weight: beta * s }); } + } + } + edges +} + +fn solve_mincut(lambdas: &[f64], edges: &[Edge], gamma: f64) -> Vec { + let m = lambdas.len(); + let (s, t, n) = (m, m + 1, m + 2); + let mut adj: Vec> = vec![Vec::new(); n]; + let mut caps: Vec = Vec::new(); + let ae = |adj: &mut Vec>, caps: &mut Vec, u: usize, v: usize, c: f64| { + let idx = caps.len(); caps.push(c); caps.push(0.0); + adj[u].push((v, idx)); adj[v].push((u, idx + 1)); + }; + for i in 0..m { + let p0 = lambdas[i].max(0.0); let p1 = (-lambdas[i]).max(0.0); + if p0 > 1e-12 { ae(&mut adj, &mut caps, s, i, p0); } + if p1 > 1e-12 { ae(&mut adj, &mut caps, i, t, p1); } + } + for e in edges { + let c = gamma * e.weight; + if c > 1e-12 { ae(&mut adj, &mut caps, e.from, e.to, c); } + } + loop { + let mut par: Vec> = vec![None; n]; + let mut vis = vec![false; n]; + let mut q = std::collections::VecDeque::new(); + vis[s] = true; q.push_back(s); + while let Some(u) = q.pop_front() { + if u == t { break; } + for &(v, ei) in &adj[u] { + if !vis[v] && caps[ei] > 1e-15 { vis[v] = true; par[v] = Some((u, ei)); q.push_back(v); } + } + } + if !vis[t] { break; } + let mut bn = f64::MAX; + let mut v = t; + while let Some((u, ei)) = par[v] { bn = bn.min(caps[ei]); v = u; } + v = t; + while let Some((u, ei)) = par[v] { caps[ei] -= bn; caps[ei ^ 1] += bn; v = u; } + } + let mut reach = vec![false; n]; + let mut stk = vec![s]; reach[s] = true; + while let Some(u) = stk.pop() { + for &(v, ei) in &adj[u] { + if !reach[v] && caps[ei] > 1e-15 { reach[v] = true; stk.push(v); } + } + } + (0..m).map(|i| reach[i]).collect() +} + +fn threshold_detect(stations: &[Station], stats: &StationStats) -> Vec { + stations.iter().map(|st| { + let t_z = ((st.temperature - stats.t_mean) / stats.t_std.max(1e-6)).abs(); + let aqi_hi = st.aqi > 150.0; + let co2_hi = (st.co2 - 420.0).abs() > 40.0; + let ndvi_lo = st.ndvi < stats.ndvi_mean - 2.5 * stats.ndvi_std; + t_z > 3.0 || aqi_hi || co2_hi || ndvi_lo + }).collect() +} + +struct Metrics { precision: f64, recall: f64, f1: f64, fpr: f64 } + +fn evaluate(stations: &[Station], pred: &[bool]) -> Metrics { + let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0, 0, 0); + for (i, st) in stations.iter().enumerate() { + let truth = st.truth != AnomalyType::Normal; + match (truth, pred[i]) { + (true, true) => tp += 1, (false, true) => fp += 1, + (true, false) => fn_ += 1, (false, false) => tn += 1, + } + } + let prec = if tp + fp > 0 { tp as f64 / (tp + fp) as f64 } else { 0.0 }; + let rec = if tp + fn_ > 0 { tp as f64 / (tp + fn_) as f64 } else { 0.0 }; + let f1 = if prec + rec > 0.0 { 2.0 * prec * rec / (prec + rec) } else { 0.0 }; + let fpr = if tn + fp > 0 { fp as f64 / (tn + fp) as f64 } else { 0.0 }; + Metrics { precision: prec, recall: rec, f1, fpr } +} + +fn per_event_detection(stations: &[Station], pred: &[bool]) -> Vec<(&'static str, usize, usize)> { + let types = [ + AnomalyType::HeatWave, AnomalyType::PollutionSpike, AnomalyType::Drought, + AnomalyType::OceanWarming, AnomalyType::ColdSnap, AnomalyType::SensorFault, + ]; + types.iter().map(|&at| { + let total = stations.iter().filter(|s| s.truth == at).count(); + let detected = stations.iter().enumerate() + .filter(|(i, s)| s.truth == at && pred[*i]).count(); + (at.label(), detected, total) + }).collect() +} + +fn hex(b: &[u8]) -> String { b.iter().map(|x| format!("{:02x}", x)).collect() } + +fn main() { + println!("=== Climate Anomaly Detection via Graph Cut / MRF ===\n"); + let (alpha, beta, gamma, k_nn) = (0.25, 0.12, 0.5, 3usize); + let days = [172.0, 355.0]; // summer solstice, late December + let day_labels = ["Summer", "Winter"]; + + let tmp = TempDir::new().expect("tmpdir"); + let opts = RvfOptions { dimension: DIM as u16, metric: DistanceMetric::Cosine, ..Default::default() }; + let mut store = RvfStore::create(&tmp.path().join("climate.rvenv"), opts).expect("create"); + + println!(" Grid: {}x{} = {} stations | Dim: {} | alpha={} beta={} gamma={} k={}\n", + ROWS, COLS, N_STATIONS, DIM, alpha, beta, gamma, k_nn); + + let (mut all_vecs, mut all_ids, mut all_meta): (Vec>, Vec, Vec) = + (Vec::new(), Vec::new(), Vec::new()); + + for (si, &day) in days.iter().enumerate() { + let seed = 42 + si as u64 * 7919; + let stations = generate_stations(seed, day); + let n_anom = stations.iter().filter(|s| s.truth != AnomalyType::Normal).count(); + println!("--- Season: {} (day {}) ---", day_labels[si], day as u32); + println!(" {} stations, {} anomalous ({:.1}%)", N_STATIONS, n_anom, + n_anom as f64 / N_STATIONS as f64 * 100.0); + for at in &[AnomalyType::HeatWave, AnomalyType::PollutionSpike, AnomalyType::Drought, + AnomalyType::OceanWarming, AnomalyType::ColdSnap, AnomalyType::SensorFault] { + let c = stations.iter().filter(|s| s.truth == *at).count(); + if c > 0 { println!(" {}: {}", at.label(), c); } + } + + let stats = compute_stats(&stations); + let embs: Vec> = stations.iter().map(|s| extract_embedding(s, &stats)).collect(); + let lam: Vec = stations.iter().map(|s| unary_score(s, &stats)).collect(); + let edges = build_graph(&stations, &embs, alpha, beta, k_nn); + let gc = solve_mincut(&lam, &edges, gamma); + let tc = threshold_detect(&stations, &stats); + + let gc_m = evaluate(&stations, &gc); + let tc_m = evaluate(&stations, &tc); + println!("\n {:>12} {:>8} {:>8} {:>8} {:>8}", "Method", "Prec", "Recall", "F1", "FPR"); + println!(" {:->12} {:->8} {:->8} {:->8} {:->8}", "", "", "", "", ""); + println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}", + "Graph Cut", gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr); + println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}", + "Threshold", tc_m.precision, tc_m.recall, tc_m.f1, tc_m.fpr); + + println!("\n Per-event detection (Graph Cut):"); + println!(" {:>14} {:>6} {:>6} {:>8}", "Event", "Det", "Total", "Rate"); + println!(" {:->14} {:->6} {:->6} {:->8}", "", "", "", ""); + for (label, det, total) in per_event_detection(&stations, &gc) { + if total > 0 { + println!(" {:>14} {:>6} {:>6} {:>7.1}%", label, det, total, + det as f64 / total as f64 * 100.0); + } + } + + // Spatial coherence: count contiguous anomaly regions in graph cut + let mut visited = vec![false; N_STATIONS]; + let mut n_regions = 0usize; + for i in 0..N_STATIONS { + if gc[i] && !visited[i] { + n_regions += 1; + let mut stk = vec![i]; + while let Some(u) = stk.pop() { + if visited[u] { continue; } + visited[u] = true; + let (ur, uc) = (u / COLS, u % COLS); + for &(dr, dc) in &[(-1i32, 0i32), (1, 0), (0, -1), (0, 1)] { + let (nr, nc) = (ur as i32 + dr, uc as i32 + dc); + if nr >= 0 && nr < ROWS as i32 && nc >= 0 && nc < COLS as i32 { + let ni = nr as usize * COLS + nc as usize; + if gc[ni] && !visited[ni] { stk.push(ni); } + } + } + } + } + } + let gc_count = gc.iter().filter(|&&x| x).count(); + println!("\n Spatial coherence: {} anomaly regions ({} flagged stations)", + n_regions, gc_count); + + // Store embeddings with metadata + for (i, emb) in embs.iter().enumerate() { + let id = si as u64 * 100_000 + i as u64; + all_vecs.push(emb.clone()); all_ids.push(id); + all_meta.push(MetadataEntry { field_id: FIELD_REGION, + value: MetadataValue::String(region_label(stations[i].row, stations[i].col).into()) }); + all_meta.push(MetadataEntry { field_id: FIELD_ELEV_CLASS, + value: MetadataValue::String(elev_class(stations[i].elevation).into()) }); + all_meta.push(MetadataEntry { field_id: FIELD_CLIMATE_ZONE, + value: MetadataValue::String(climate_zone(stations[i].lat).into()) }); + all_meta.push(MetadataEntry { field_id: FIELD_ANOMALY_TYPE, + value: MetadataValue::String(stations[i].truth.label().into()) }); + } + println!(); + } + + // RVF ingestion + println!("--- RVF Ingestion ---"); + let refs: Vec<&[f32]> = all_vecs.iter().map(|v| v.as_slice()).collect(); + let ing = store.ingest_batch(&refs, &all_ids, Some(&all_meta)).expect("ingest"); + println!(" Ingested: {} embeddings (rejected: {})", ing.accepted, ing.rejected); + + // Filtered queries + println!("\n--- RVF Queries ---"); + let qv = all_vecs[0].clone(); + for zone in &["polar", "temperate", "subtropical", "tropical"] { + let f = FilterExpr::Eq(FIELD_CLIMATE_ZONE, FilterValue::String(zone.to_string())); + let r = store.query(&qv, 5, &QueryOptions { filter: Some(f), ..Default::default() }).expect("q"); + println!(" {:<12} -> {} results (top dist: {:.4})", zone, r.len(), + r.first().map(|x| x.distance).unwrap_or(0.0)); + } + let fa = FilterExpr::Eq(FIELD_ANOMALY_TYPE, FilterValue::String("heat_wave".into())); + let hr = store.query(&qv, 5, &QueryOptions { filter: Some(fa), ..Default::default() }).expect("q"); + println!(" heat_wave -> {} results", hr.len()); + + // Witness chain for environmental reporting audit + println!("\n--- Witness Chain (Environmental Audit) ---"); + let steps = [ + ("genesis", 0x01u8), ("observation", 0x01), ("qc_check", 0x02), + ("normalize", 0x02), ("embed", 0x02), ("spatial_graph", 0x02), + ("mincut", 0x02), ("classify", 0x02), ("per_event_eval", 0x02), + ("alert", 0x01), ("rvf_ingest", 0x08), ("similarity", 0x02), + ("lineage", 0x01), ("seal", 0x01), + ]; + let entries: Vec = steps.iter().enumerate().map(|(i, (s, wt))| WitnessEntry { + prev_hash: [0u8; 32], + action_hash: shake256_256(format!("climate_gc:{}:{}", s, i).as_bytes()), + timestamp_ns: 1_700_000_000_000_000_000 + i as u64 * 1_000_000_000, + witness_type: *wt, + }).collect(); + let chain = create_witness_chain(&entries); + let ver = verify_witness_chain(&chain).expect("verify"); + println!(" {} entries, {} bytes, VALID", ver.len(), chain.len()); + for (i, (s, _)) in steps.iter().enumerate() { + let wn = match ver[i].witness_type { 0x01 => "PROV", 0x02 => "COMP", 0x08 => "DATA", _ => "????" }; + println!(" [{:>4}] {:>2} -> {}", wn, i, s); + } + + // Lineage + println!("\n--- Lineage ---"); + let child = store.derive(&tmp.path().join("climate_report.rvenv"), + DerivationType::Filter, None).expect("derive"); + println!(" parent: {} -> child: {} (depth {})", + hex(store.file_id()), hex(child.parent_id()), child.lineage_depth()); + child.close().expect("close"); + + // Summary + println!("\n=== Summary ==="); + println!(" {} stations x {} seasons = {} embeddings", N_STATIONS, days.len(), ing.accepted); + println!(" Anomaly types: 6 | Witness: {} steps | Climate zones: 4", ver.len()); + println!(" alpha={:.2} beta={:.2} gamma={:.2} k={}", alpha, beta, gamma, k_nn); + store.close().expect("close"); + println!("\nDone."); +} diff --git a/examples/rvf/examples/cyber_threat_graphcut.rs b/examples/rvf/examples/cyber_threat_graphcut.rs new file mode 100644 index 00000000..d28822ca --- /dev/null +++ b/examples/rvf/examples/cyber_threat_graphcut.rs @@ -0,0 +1,616 @@ +//! Cybersecurity Network Threat Detection via Graph Cut + RuVector +//! +//! MRF/mincut optimization for network intrusion detection: +//! 1. Generate ~2000 synthetic network flows (24h period, ~4% attack rate) +//! 2. Inject realistic attack patterns: PortScan, BruteForce, Exfiltration, +//! C2Beacon, DDoS, LateralMovement (CICIDS2017/NSL-KDD inspired) +//! 3. Extract 32-dim flow embeddings, build temporal+similarity graph, mincut +//! 4. Evaluate: precision, recall, F1, FPR; per-attack detection rates +//! +//! Run: cargo run --example cyber_threat_graphcut --release + +use rvf_runtime::{FilterExpr, MetadataEntry, MetadataValue, QueryOptions, RvfOptions, RvfStore}; +use rvf_runtime::filter::FilterValue; +use rvf_runtime::options::DistanceMetric; +use rvf_types::DerivationType; +use rvf_crypto::{create_witness_chain, verify_witness_chain, shake256_256, WitnessEntry}; +use tempfile::TempDir; + +const DIM: usize = 32; +const N_FLOWS: usize = 2000; +const FIELD_PROTOCOL: u16 = 0; +const FIELD_PORT_CAT: u16 = 1; +const FIELD_SUBNET: u16 = 2; +const FIELD_ATTACK: u16 = 3; + +fn lcg_next(s: &mut u64) -> u64 { + *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); *s +} +fn lcg_f64(s: &mut u64) -> f64 { lcg_next(s); (*s >> 11) as f64 / ((1u64 << 53) as f64) } +fn lcg_normal(s: &mut u64) -> f64 { + let u1 = lcg_f64(s).max(1e-15); let u2 = lcg_f64(s); + (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +#[derive(Debug, Clone, Copy, PartialEq)] +enum Attack { Normal, PortScan, BruteForce, Exfiltration, C2Beacon, DDoS, LateralMovement } +impl Attack { + fn label(&self) -> &'static str { + match self { + Attack::Normal => "normal", Attack::PortScan => "portscan", + Attack::BruteForce => "bruteforce", Attack::Exfiltration => "exfil", + Attack::C2Beacon => "c2beacon", Attack::DDoS => "ddos", + Attack::LateralMovement => "lateral", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +enum Protocol { Tcp, Udp, Icmp } +impl Protocol { + fn label(&self) -> &'static str { + match self { Protocol::Tcp => "TCP", Protocol::Udp => "UDP", Protocol::Icmp => "ICMP" } + } +} + +fn port_category(port: u16) -> &'static str { + match port { + 80 | 443 | 8080 | 8443 => "web", 22 => "ssh", 53 => "dns", + 3389 => "rdp", 0 => "none", _ => "other", + } +} + +#[derive(Debug, Clone)] +struct Flow { + index: usize, + time_s: f64, // seconds into 24h window + duration_ms: f64, + bytes_sent: u64, + bytes_recv: u64, + pkts_sent: u32, + pkts_recv: u32, + protocol: Protocol, + dst_port: u16, + src_subnet: u8, // 10.x.0.0/16 subnet id (0-4 internal) + dst_subnet: u8, + is_internal_dst: bool, + syn_flag: bool, + ack_flag: bool, + rst_flag: bool, + psh_flag: bool, + fin_flag: bool, + payload_entropy: f64, // 0-8 bits + truth: Attack, +} + +fn generate_flows(n: usize, seed: u64) -> Vec { + let mut rng = seed; + let mut flows = Vec::with_capacity(n); + let ports = [80u16, 443, 22, 53, 3389, 8080, 8443, 25, 110, 993, 3306, 5432]; + + for i in 0..n { + let time_s = lcg_f64(&mut rng) * 86400.0; + let hour = (time_s / 3600.0) as u32; + // More traffic during business hours + let biz_mult = if (8..18).contains(&hour) { 1.0 } else { 0.4 }; + let _ = biz_mult; // used implicitly via attack injection timing + + let proto_r = lcg_f64(&mut rng); + let protocol = if proto_r < 0.75 { Protocol::Tcp } + else if proto_r < 0.92 { Protocol::Udp } + else { Protocol::Icmp }; + + let port_idx = (lcg_next(&mut rng) % ports.len() as u64) as usize; + let dst_port = if protocol == Protocol::Icmp { 0 } else { ports[port_idx] }; + let src_subnet = (lcg_next(&mut rng) % 5) as u8; + let dst_r = lcg_f64(&mut rng); + let is_internal = dst_r < 0.3; + let dst_subnet = if is_internal { (lcg_next(&mut rng) % 5) as u8 } else { 100 + (lcg_next(&mut rng) % 50) as u8 }; + + let duration_ms = (50.0 + lcg_f64(&mut rng) * 2000.0 + lcg_normal(&mut rng).abs() * 500.0).max(1.0); + let bytes_sent = (200.0 + lcg_f64(&mut rng) * 5000.0 + lcg_normal(&mut rng).abs() * 1000.0).max(40.0) as u64; + let bytes_recv = (500.0 + lcg_f64(&mut rng) * 20000.0 + lcg_normal(&mut rng).abs() * 3000.0).max(40.0) as u64; + let pkts_sent = (1.0 + bytes_sent as f64 / (400.0 + lcg_f64(&mut rng) * 800.0)).max(1.0) as u32; + let pkts_recv = (1.0 + bytes_recv as f64 / (400.0 + lcg_f64(&mut rng) * 800.0)).max(1.0) as u32; + let entropy = 4.0 + lcg_normal(&mut rng) * 1.0; + + flows.push(Flow { + index: i, time_s, duration_ms, bytes_sent, bytes_recv, + pkts_sent, pkts_recv, protocol, dst_port, src_subnet, dst_subnet, + is_internal_dst: is_internal, + syn_flag: lcg_f64(&mut rng) < 0.6, + ack_flag: lcg_f64(&mut rng) < 0.8, + rst_flag: lcg_f64(&mut rng) < 0.05, + psh_flag: lcg_f64(&mut rng) < 0.3, + fin_flag: lcg_f64(&mut rng) < 0.4, + payload_entropy: entropy.clamp(0.0, 8.0), + truth: Attack::Normal, + }); + } + + // Inject attacks (~4% = ~80 flows) + // PortScan: rapid small packets to many ports from one source + let scan_src = 2u8; + let scan_start = 14400.0; // 4am + for j in 0..15 { + let idx = 50 + j * 3; + if idx >= n { break; } + let f = &mut flows[idx]; + f.truth = Attack::PortScan; + f.time_s = scan_start + j as f64 * 0.5; + f.duration_ms = 2.0 + lcg_f64(&mut rng) * 10.0; + f.bytes_sent = 40 + (lcg_next(&mut rng) % 60) as u64; + f.bytes_recv = 0; + f.pkts_sent = 1; + f.pkts_recv = 0; + f.protocol = Protocol::Tcp; + f.dst_port = 1024 + (lcg_next(&mut rng) % 64000) as u16; + f.src_subnet = scan_src; + f.syn_flag = true; f.ack_flag = false; f.rst_flag = false; + f.payload_entropy = 0.5 + lcg_f64(&mut rng) * 0.5; + } + + // BruteForce: repeated SSH/RDP from one source, many RSTs + let bf_src = 3u8; + for j in 0..12 { + let idx = 200 + j * 2; + if idx >= n { break; } + let f = &mut flows[idx]; + f.truth = Attack::BruteForce; + f.time_s = 28800.0 + j as f64 * 2.0; // 8am + f.duration_ms = 100.0 + lcg_f64(&mut rng) * 200.0; + f.bytes_sent = 80 + (lcg_next(&mut rng) % 200) as u64; + f.bytes_recv = 60 + (lcg_next(&mut rng) % 100) as u64; + f.pkts_sent = 3 + (lcg_next(&mut rng) % 5) as u32; + f.pkts_recv = 2; + f.protocol = Protocol::Tcp; + f.dst_port = if lcg_f64(&mut rng) < 0.7 { 22 } else { 3389 }; + f.src_subnet = bf_src; + f.dst_subnet = 120; + f.is_internal_dst = false; + f.syn_flag = true; f.rst_flag = lcg_f64(&mut rng) < 0.8; + f.payload_entropy = 2.0 + lcg_f64(&mut rng) * 1.5; + } + + // Exfiltration: large outbound at night + for j in 0..10 { + let idx = 400 + j * 4; + if idx >= n { break; } + let f = &mut flows[idx]; + f.truth = Attack::Exfiltration; + f.time_s = 7200.0 + j as f64 * 120.0; // 2-3am + f.duration_ms = 5000.0 + lcg_f64(&mut rng) * 15000.0; + f.bytes_sent = 500_000 + (lcg_next(&mut rng) % 2_000_000) as u64; + f.bytes_recv = 200 + (lcg_next(&mut rng) % 500) as u64; + f.pkts_sent = 300 + (lcg_next(&mut rng) % 500) as u32; + f.pkts_recv = 5; + f.protocol = Protocol::Tcp; + f.dst_port = 443; + f.src_subnet = 1; + f.dst_subnet = 130; + f.is_internal_dst = false; + f.psh_flag = true; + f.payload_entropy = 7.2 + lcg_f64(&mut rng) * 0.6; + } + + // C2Beacon: periodic small payloads to same external IP + let c2_dst = 140u8; + for j in 0..15 { + let idx = 600 + j * 5; + if idx >= n { break; } + let f = &mut flows[idx]; + f.truth = Attack::C2Beacon; + f.time_s = 3600.0 * j as f64 / 15.0 * 24.0; // spread across day + f.duration_ms = 50.0 + lcg_f64(&mut rng) * 100.0; + f.bytes_sent = 80 + (lcg_next(&mut rng) % 150) as u64; + f.bytes_recv = 60 + (lcg_next(&mut rng) % 120) as u64; + f.pkts_sent = 1; + f.pkts_recv = 1; + f.protocol = Protocol::Tcp; + f.dst_port = 443; + f.src_subnet = 0; + f.dst_subnet = c2_dst; + f.is_internal_dst = false; + f.payload_entropy = 6.5 + lcg_f64(&mut rng) * 1.0; + } + + // DDoS: high packet rate from many sources to single target + for j in 0..18 { + let idx = 900 + j * 2; + if idx >= n { break; } + let f = &mut flows[idx]; + f.truth = Attack::DDoS; + f.time_s = 43200.0 + j as f64 * 0.3; // noon burst + f.duration_ms = 1.0 + lcg_f64(&mut rng) * 5.0; + f.bytes_sent = 40 + (lcg_next(&mut rng) % 80) as u64; + f.bytes_recv = 0; + f.pkts_sent = 50 + (lcg_next(&mut rng) % 200) as u32; + f.pkts_recv = 0; + f.protocol = if lcg_f64(&mut rng) < 0.6 { Protocol::Udp } else { Protocol::Tcp }; + f.dst_port = 80; + f.src_subnet = (lcg_next(&mut rng) % 5) as u8; + f.dst_subnet = 110; + f.is_internal_dst = false; + f.syn_flag = true; f.ack_flag = false; + f.payload_entropy = 1.0 + lcg_f64(&mut rng) * 1.0; + } + + // LateralMovement: internal-to-internal, unusual ports + for j in 0..10 { + let idx = 1200 + j * 3; + if idx >= n { break; } + let f = &mut flows[idx]; + f.truth = Attack::LateralMovement; + f.time_s = 50400.0 + j as f64 * 60.0; // 2pm + f.duration_ms = 200.0 + lcg_f64(&mut rng) * 1000.0; + f.bytes_sent = 1000 + (lcg_next(&mut rng) % 10000) as u64; + f.bytes_recv = 500 + (lcg_next(&mut rng) % 5000) as u64; + f.pkts_sent = 10 + (lcg_next(&mut rng) % 30) as u32; + f.pkts_recv = 8 + (lcg_next(&mut rng) % 20) as u32; + f.protocol = Protocol::Tcp; + f.dst_port = 445 + (lcg_next(&mut rng) % 100) as u16; // SMB-ish + f.src_subnet = j as u8 % 5; + f.dst_subnet = (j as u8 + 1) % 5; + f.is_internal_dst = true; + f.psh_flag = true; + f.payload_entropy = 5.5 + lcg_f64(&mut rng) * 1.5; + } + + flows.sort_by(|a, b| a.time_s.partial_cmp(&b.time_s).unwrap()); + for (i, f) in flows.iter_mut().enumerate() { f.index = i; } + flows +} + +fn extract_embedding(f: &Flow) -> Vec { + let log_dur = (f.duration_ms.max(1.0)).ln() as f32 / 10.0; + let log_bs = (f.bytes_sent.max(1) as f64).ln() as f32 / 15.0; + let log_br = (f.bytes_recv.max(1) as f64).ln() as f32 / 15.0; + let pkt_ratio = if f.pkts_sent + f.pkts_recv > 0 { + f.pkts_sent as f32 / (f.pkts_sent + f.pkts_recv) as f32 + } else { 0.5 }; + // Protocol one-hot + let (p_tcp, p_udp, p_icmp) = match f.protocol { + Protocol::Tcp => (1.0f32, 0.0, 0.0), + Protocol::Udp => (0.0, 1.0, 0.0), + Protocol::Icmp => (0.0, 0.0, 1.0), + }; + // Port category one-hot + let pc = port_category(f.dst_port); + let (pw, ps, pd, pr, po) = match pc { + "web"=>(1.0f32,0.0,0.0,0.0,0.0), "ssh"=>(0.0,1.0,0.0,0.0,0.0), + "dns"=>(0.0,0.0,1.0,0.0,0.0), "rdp"=>(0.0,0.0,0.0,1.0,0.0), + _=>(0.0,0.0,0.0,0.0,1.0), + }; + let internal = if f.is_internal_dst { 1.0f32 } else { 0.0 }; + let subnet_enc = f.src_subnet as f32 / 5.0; + // Time cyclic + let hour_rad = f.time_s as f32 / 86400.0 * 2.0 * std::f32::consts::PI; + let t_sin = hour_rad.sin(); + let t_cos = hour_rad.cos(); + let dow = 0.5f32; // mid-week placeholder + let entropy = f.payload_entropy as f32 / 8.0; + let avg_pkt = if f.pkts_sent + f.pkts_recv > 0 { + (f.bytes_sent + f.bytes_recv) as f32 / (f.pkts_sent + f.pkts_recv) as f32 / 1500.0 + } else { 0.0 }; + let bpp = if f.bytes_sent > 0 && f.pkts_sent > 0 { + f.bytes_sent as f32 / f.pkts_sent as f32 / 1500.0 + } else { 0.0 }; + // Flag pattern features + let syn_flood = if f.syn_flag && !f.ack_flag { 1.0f32 } else { 0.0 }; + let rst_ind = if f.rst_flag { 1.0f32 } else { 0.0 }; + // Derived + let vol = log_bs + log_br; + let asym = (log_bs - log_br).abs(); + + vec![ + log_dur, log_bs, log_br, pkt_ratio, + p_tcp, p_udp, p_icmp, + pw, ps, pd, pr, po, + internal, subnet_enc, + t_sin, t_cos, dow, + entropy, avg_pkt, bpp, + syn_flood, rst_ind, + vol, asym, + f.pkts_sent as f32 / 100.0, f.pkts_recv as f32 / 100.0, + (f.duration_ms as f32 / 1000.0).min(5.0), + (f.bytes_sent as f64 / f.duration_ms.max(1.0)) as f32 / 1000.0, + entropy * syn_flood, entropy * asym, + log_dur * rst_ind, vol * internal, + ] +} + +fn cosine_sim(a: &[f32], b: &[f32]) -> f64 { + let (mut d, mut na, mut nb) = (0.0f64, 0.0f64, 0.0f64); + for i in 0..a.len().min(b.len()) { + d += a[i] as f64 * b[i] as f64; + na += (a[i] as f64).powi(2); nb += (b[i] as f64).powi(2); + } + let dn = na.sqrt() * nb.sqrt(); + if dn < 1e-15 { 0.0 } else { d / dn } +} + +fn anomaly_score(f: &Flow) -> f64 { + let vol = ((f.bytes_sent + f.bytes_recv).max(1) as f64).ln(); + let ent = f.payload_entropy; + let dur = (f.duration_ms.max(1.0)).ln(); + let pps = if f.duration_ms > 0.0 { f.pkts_sent as f64 / (f.duration_ms / 1000.0) } else { 0.0 }; + let syn_no_ack = if f.syn_flag && !f.ack_flag { 1.0 } else { 0.0 }; + let rst_p = if f.rst_flag { 0.5 } else { 0.0 }; + let night = if f.time_s < 21600.0 || f.time_s > 79200.0 { 0.3 } else { 0.0 }; + let high_vol = if vol > 12.0 { (vol - 12.0) * 0.3 } else { 0.0 }; + let low_pkt = if f.bytes_sent > 0 && f.pkts_sent <= 1 && f.bytes_recv == 0 { 0.4 } else { 0.0 }; + let high_pps = if pps > 50.0 { (pps - 50.0) * 0.01 } else { 0.0 }; + let high_ent = if ent > 6.5 { (ent - 6.5) * 0.5 } else { 0.0 }; + let short_burst = if dur < 2.0 && f.pkts_sent > 10 { 0.5 } else { 0.0 }; + + 0.2 * syn_no_ack + rst_p + night + high_vol + low_pkt + high_pps + high_ent + short_burst - 0.4 +} + +struct Edge { from: usize, to: usize, weight: f64 } + +fn build_graph(flows: &[Flow], embs: &[Vec], alpha: f64, beta: f64, k: usize) -> Vec { + let m = flows.len(); + let mut edges = Vec::new(); + + // Temporal chain: consecutive flows from same source subnet + for i in 1..m { + if flows[i].src_subnet == flows[i - 1].src_subnet { + let dt = (flows[i].time_s - flows[i - 1].time_s).abs(); + let tw = alpha * (-dt / 300.0).exp(); // decay over 5min + if tw > 1e-6 { + edges.push(Edge { from: i - 1, to: i, weight: tw }); + edges.push(Edge { from: i, to: i - 1, weight: tw }); + } + } + } + + // Same-destination smoothing + for i in 0..m { + for j in (i + 1)..m.min(i + 50) { + if flows[i].dst_subnet == flows[j].dst_subnet && flows[i].dst_port == flows[j].dst_port { + edges.push(Edge { from: i, to: j, weight: alpha * 0.5 }); + edges.push(Edge { from: j, to: i, weight: alpha * 0.5 }); + } + } + } + + // kNN similarity from embeddings + for i in 0..m { + let mut sims: Vec<(usize, f64)> = (0..m) + .filter(|&j| j != i && (j as isize - i as isize).unsigned_abs() > 3) + .map(|j| (j, cosine_sim(&embs[i], &embs[j]).max(0.0))).collect(); + sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + for &(j, s) in sims.iter().take(k) { + if s > 0.1 { edges.push(Edge { from: i, to: j, weight: beta * s }); } + } + } + + edges +} + +fn solve_mincut(lambdas: &[f64], edges: &[Edge], gamma: f64) -> Vec { + let m = lambdas.len(); + let (s, t, n) = (m, m + 1, m + 2); + let mut adj: Vec> = vec![Vec::new(); n]; + let mut caps: Vec = Vec::new(); + let ae = |adj: &mut Vec>, caps: &mut Vec, u: usize, v: usize, c: f64| { + let idx = caps.len(); caps.push(c); caps.push(0.0); + adj[u].push((v, idx)); adj[v].push((u, idx + 1)); + }; + for i in 0..m { + let p0 = lambdas[i].max(0.0); + let p1 = (-lambdas[i]).max(0.0); + if p0 > 1e-12 { ae(&mut adj, &mut caps, s, i, p0); } + if p1 > 1e-12 { ae(&mut adj, &mut caps, i, t, p1); } + } + for e in edges { + let c = gamma * e.weight; + if c > 1e-12 { ae(&mut adj, &mut caps, e.from, e.to, c); } + } + loop { + let mut par: Vec> = vec![None; n]; + let mut vis = vec![false; n]; + let mut q = std::collections::VecDeque::new(); + vis[s] = true; q.push_back(s); + while let Some(u) = q.pop_front() { + if u == t { break; } + for &(v, ei) in &adj[u] { + if !vis[v] && caps[ei] > 1e-15 { vis[v] = true; par[v] = Some((u, ei)); q.push_back(v); } + } + } + if !vis[t] { break; } + let mut bn = f64::MAX; + let mut v = t; + while let Some((u, ei)) = par[v] { bn = bn.min(caps[ei]); v = u; } + v = t; + while let Some((u, ei)) = par[v] { caps[ei] -= bn; caps[ei ^ 1] += bn; v = u; } + } + let mut reach = vec![false; n]; + let mut stk = vec![s]; reach[s] = true; + while let Some(u) = stk.pop() { + for &(v, ei) in &adj[u] { + if !reach[v] && caps[ei] > 1e-15 { reach[v] = true; stk.push(v); } + } + } + (0..m).map(|i| reach[i]).collect() +} + +fn threshold_detect(flows: &[Flow]) -> Vec { + // Simple baseline: volume > 3 sigma or entropy > 6.0 + let vols: Vec = flows.iter().map(|f| (f.bytes_sent + f.bytes_recv) as f64).collect(); + let mean = vols.iter().sum::() / vols.len() as f64; + let std = (vols.iter().map(|v| (v - mean).powi(2)).sum::() / vols.len() as f64).sqrt(); + flows.iter().enumerate().map(|(i, f)| { + vols[i] > mean + 3.0 * std || f.payload_entropy > 6.0 + }).collect() +} + +struct Metrics { precision: f64, recall: f64, f1: f64, fpr: f64 } + +fn evaluate(flows: &[Flow], preds: &[bool]) -> Metrics { + let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0, 0, 0); + for (f, &p) in flows.iter().zip(preds) { + let truth = f.truth != Attack::Normal; + match (p, truth) { + (true, true) => tp += 1, (true, false) => fp += 1, + (false, false) => tn += 1, (false, true) => fn_ += 1, + } + } + let prec = if tp + fp > 0 { tp as f64 / (tp + fp) as f64 } else { 0.0 }; + let rec = if tp + fn_ > 0 { tp as f64 / (tp + fn_) as f64 } else { 0.0 }; + let f1 = if prec + rec > 0.0 { 2.0 * prec * rec / (prec + rec) } else { 0.0 }; + let fpr = if tn + fp > 0 { fp as f64 / (tn + fp) as f64 } else { 0.0 }; + Metrics { precision: prec, recall: rec, f1, fpr } +} + +fn per_attack_detection(flows: &[Flow], preds: &[bool]) -> Vec<(&'static str, usize, usize)> { + let attacks = [ + Attack::PortScan, Attack::BruteForce, Attack::Exfiltration, + Attack::C2Beacon, Attack::DDoS, Attack::LateralMovement, + ]; + attacks.iter().map(|at| { + let total = flows.iter().filter(|f| f.truth == *at).count(); + let det = flows.iter().zip(preds).filter(|(f, &p)| f.truth == *at && p).count(); + (at.label(), det, total) + }).collect() +} + +fn hex(b: &[u8]) -> String { b.iter().map(|x| format!("{:02x}", x)).collect() } + +fn main() { + println!("=== Cyber Threat Detection via Graph Cut + RuVector ===\n"); + + let (alpha, beta, gamma, k_nn) = (0.25, 0.12, 0.5, 3usize); + let tmp = TempDir::new().expect("tmpdir"); + let opts = RvfOptions { dimension: DIM as u16, metric: DistanceMetric::Cosine, ..Default::default() }; + let mut store = RvfStore::create(&tmp.path().join("cyber_flows.rvnet"), opts).expect("create"); + + println!(" Flows: {} | Dim: {} | alpha={} beta={} gamma={} k={}\n", + N_FLOWS, DIM, alpha, beta, gamma, k_nn); + + let flows = generate_flows(N_FLOWS, 42); + let n_attack = flows.iter().filter(|f| f.truth != Attack::Normal).count(); + println!(" Generated: {} flows, {} attacks ({:.1}%)", + N_FLOWS, n_attack, n_attack as f64 / N_FLOWS as f64 * 100.0); + + println!("\n Attack Distribution:"); + for at in &[Attack::PortScan, Attack::BruteForce, Attack::Exfiltration, + Attack::C2Beacon, Attack::DDoS, Attack::LateralMovement] { + let c = flows.iter().filter(|f| f.truth == *at).count(); + if c > 0 { println!(" {:>12}: {:>3}", at.label(), c); } + } + + // Extract embeddings + let embs: Vec> = flows.iter().map(|f| extract_embedding(f)).collect(); + + // Build graph and solve + let lam: Vec = flows.iter().map(|f| anomaly_score(f)).collect(); + let edges = build_graph(&flows, &embs, alpha, beta, k_nn); + println!("\n Graph: {} edges ({:.1} per flow)", edges.len(), edges.len() as f64 / N_FLOWS as f64); + + let gc_preds = solve_mincut(&lam, &edges, gamma); + let th_preds = threshold_detect(&flows); + + let gc_m = evaluate(&flows, &gc_preds); + let th_m = evaluate(&flows, &th_preds); + + println!("\n {:>12} {:>8} {:>8} {:>8} {:>8}", "Method", "Prec", "Recall", "F1", "FPR"); + println!(" {:->12} {:->8} {:->8} {:->8} {:->8}", "", "", "", "", ""); + println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}", "Graph Cut", gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr); + println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}", "Threshold", th_m.precision, th_m.recall, th_m.f1, th_m.fpr); + + println!("\n Per-Attack Detection (Graph Cut vs Threshold):"); + println!(" {:>12} {:>8} {:>10}", "Attack", "GC", "Threshold"); + println!(" {:->12} {:->8} {:->10}", "", "", ""); + let gc_pa = per_attack_detection(&flows, &gc_preds); + let th_pa = per_attack_detection(&flows, &th_preds); + for (gc, th) in gc_pa.iter().zip(th_pa.iter()) { + println!(" {:>12} {:>3}/{:<3} {:>5}/{:<3}", + gc.0, gc.1, gc.2, th.1, th.2); + } + + // RVF ingestion + println!("\n--- RVF Ingestion ---"); + let mut vecs = Vec::new(); + let mut ids = Vec::new(); + let mut meta = Vec::new(); + for (i, f) in flows.iter().enumerate() { + vecs.push(embs[i].clone()); + ids.push(i as u64); + meta.push(MetadataEntry { field_id: FIELD_PROTOCOL, + value: MetadataValue::String(f.protocol.label().into()) }); + meta.push(MetadataEntry { field_id: FIELD_PORT_CAT, + value: MetadataValue::String(port_category(f.dst_port).into()) }); + meta.push(MetadataEntry { field_id: FIELD_SUBNET, + value: MetadataValue::U64(f.src_subnet as u64) }); + meta.push(MetadataEntry { field_id: FIELD_ATTACK, + value: MetadataValue::String(f.truth.label().into()) }); + } + let refs: Vec<&[f32]> = vecs.iter().map(|v| v.as_slice()).collect(); + let ing = store.ingest_batch(&refs, &ids, Some(&meta)).expect("ingest"); + println!(" Ingested: {} (rejected: {})", ing.accepted, ing.rejected); + + // Filtered queries + println!("\n--- RVF Queries ---"); + // Find flows similar to a known portscan + let scan_idx = flows.iter().position(|f| f.truth == Attack::PortScan).unwrap_or(0); + let qv = &embs[scan_idx]; + let res = store.query(qv, 10, &QueryOptions::default()).expect("q"); + println!(" PortScan-similar (flow {}): {} results", scan_idx, res.len()); + for r in res.iter().take(5) { + let at = flows[r.id as usize].truth.label(); + println!(" id={:>5} dist={:.4} attack={}", r.id, r.distance, at); + } + + // Filter by protocol + let ft = FilterExpr::Eq(FIELD_PROTOCOL, FilterValue::String("TCP".into())); + let tr = store.query(qv, 5, &QueryOptions { filter: Some(ft), ..Default::default() }).expect("q"); + println!(" TCP-only: {} results", tr.len()); + + let fu = FilterExpr::Eq(FIELD_PROTOCOL, FilterValue::String("UDP".into())); + let ur = store.query(qv, 5, &QueryOptions { filter: Some(fu), ..Default::default() }).expect("q"); + println!(" UDP-only: {} results", ur.len()); + + // Witness chain: incident response audit trail + println!("\n--- Witness Chain (Incident Response) ---"); + let steps = [ + ("capture", 0x08u8), ("normalize", 0x02), ("embed", 0x02), + ("graph_build", 0x02), ("mincut_solve", 0x02), ("classify", 0x02), + ("alert", 0x01), ("forensics", 0x02), ("rvf_ingest", 0x08), + ("query", 0x02), ("report", 0x01), ("seal", 0x01), + ]; + let entries: Vec = steps.iter().enumerate().map(|(i, (s, wt))| WitnessEntry { + prev_hash: [0u8; 32], + action_hash: shake256_256(format!("cyber:{}:{}", s, i).as_bytes()), + timestamp_ns: 1_700_000_000_000_000_000 + i as u64 * 1_000_000_000, + witness_type: *wt, + }).collect(); + let chain = create_witness_chain(&entries); + let verified = verify_witness_chain(&chain).expect("verify"); + println!(" {} entries, {} bytes, VALID", verified.len(), chain.len()); + for (i, (s, _)) in steps.iter().enumerate() { + let wn = match verified[i].witness_type { 0x01 => "PROV", 0x02 => "COMP", 0x08 => "DATA", _ => "????" }; + println!(" [{:>4}] {:>2} -> {}", wn, i, s); + } + + // Lineage + println!("\n--- Lineage ---"); + let child = store.derive(&tmp.path().join("alerts.rvnet"), DerivationType::Filter, None).expect("derive"); + let lineage_depth = child.lineage_depth(); + println!(" parent: {} -> child: {} (depth {})", + hex(store.file_id()), hex(child.parent_id()), lineage_depth); + child.close().expect("close"); + + // Summary + println!("\n=== Summary ==="); + println!(" {} flows | {} attacks ({:.1}%) | {} embeddings ingested", + N_FLOWS, n_attack, n_attack as f64 / N_FLOWS as f64 * 100.0, ing.accepted); + println!(" Graph Cut F1={:.3} Prec={:.3} Rec={:.3} FPR={:.3}", + gc_m.f1, gc_m.precision, gc_m.recall, gc_m.fpr); + println!(" Threshold F1={:.3} Prec={:.3} Rec={:.3} FPR={:.3}", + th_m.f1, th_m.precision, th_m.recall, th_m.fpr); + println!(" Witness: {} steps | Lineage depth: {}", verified.len(), lineage_depth); + + store.close().expect("close"); + println!("\nDone."); +} diff --git a/examples/rvf/examples/financial_fraud_graphcut.rs b/examples/rvf/examples/financial_fraud_graphcut.rs new file mode 100644 index 00000000..f2f5fc01 --- /dev/null +++ b/examples/rvf/examples/financial_fraud_graphcut.rs @@ -0,0 +1,518 @@ +//! Financial Fraud Detection via Graph Cut / MRF Optimization + RuVector +//! +//! Applies the same MRF/mincut formulation used in genomic and medical pipelines +//! to financial transaction fraud detection: +//! +//! 1. Generate ~2000 synthetic transactions with realistic distributions +//! 2. Inject fraud patterns: card-not-present, account takeover, card clone, +//! synthetic identity, refund abuse (~0.5% fraud rate) +//! 3. Extract 32-dim embeddings, build transaction graph, solve s-t mincut +//! 4. Compare graph cut vs simple threshold baseline +//! 5. Store embeddings in RVF with merchant/amount metadata, witness chain +//! +//! Run: cargo run --example financial_fraud_graphcut --release + +use rvf_runtime::{FilterExpr, MetadataEntry, MetadataValue, QueryOptions, RvfOptions, RvfStore}; +use rvf_runtime::filter::FilterValue; +use rvf_runtime::options::DistanceMetric; +use rvf_types::DerivationType; +use rvf_crypto::{create_witness_chain, verify_witness_chain, shake256_256, WitnessEntry}; +use tempfile::TempDir; + +const DIM: usize = 32; +const N_TX: usize = 2000; +const FIELD_MERCHANT: u16 = 0; +const FIELD_AMOUNT_BUCKET: u16 = 1; +const FIELD_FRAUD_TYPE: u16 = 2; + +fn lcg_next(s: &mut u64) -> u64 { + *s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); *s +} +fn lcg_f64(s: &mut u64) -> f64 { lcg_next(s); (*s >> 11) as f64 / ((1u64 << 53) as f64) } +fn lcg_normal(s: &mut u64) -> f64 { + let u1 = lcg_f64(s).max(1e-15); let u2 = lcg_f64(s); + (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +#[derive(Debug, Clone, Copy, PartialEq)] +enum FraudType { Legit, CardNotPresent, AccountTakeover, CardClone, Synthetic, Refund } +impl FraudType { + fn label(&self) -> &'static str { + match self { + Self::Legit => "legit", Self::CardNotPresent => "CNP", + Self::AccountTakeover => "ATO", Self::CardClone => "clone", + Self::Synthetic => "synth", Self::Refund => "refund", + } + } +} + +const MERCHANTS: [&str; 10] = [ + "grocery", "gas", "restaurant", "online_retail", "travel", + "electronics", "pharmacy", "subscription", "atm", "luxury", +]; + +#[derive(Debug, Clone)] +struct Transaction { + id: usize, + card_id: usize, + amount: f64, + hour: f64, + merchant_cat: usize, + card_present: bool, + geo_distance_km: f64, + velocity: f64, + v_features: [f64; 10], + fraud: FraudType, +} + +fn generate_transactions(n: usize, seed: u64) -> Vec { + let mut rng = seed; + let n_cards = 200; + let mut txs = Vec::with_capacity(n); + let mut card_last_hour = vec![0.0f64; n_cards]; + let mut card_tx_count = vec![0usize; n_cards]; + let mut card_last_geo = vec![0.0f64; n_cards]; + + for i in 0..n { + let card_id = (lcg_next(&mut rng) as usize) % n_cards; + // Log-normal amount: median ~$50, mean ~$88 + let log_amt = 3.9 + lcg_normal(&mut rng) * 0.8; + let amount = log_amt.exp().min(5000.0); + // Time of day: bimodal (lunch 12h, evening 19h) + let hour = if lcg_f64(&mut rng) < 0.6 { + 12.0 + lcg_normal(&mut rng) * 3.0 + } else { + 19.0 + lcg_normal(&mut rng) * 2.5 + }.rem_euclid(24.0); + let merchant_cat = (lcg_next(&mut rng) as usize) % 10; + let card_present = lcg_f64(&mut rng) < 0.7; + let home_geo = (card_id as f64 * 7.3) % 180.0; + let geo_distance_km = (lcg_f64(&mut rng) * 15.0).max(0.1); + let dt = hour - card_last_hour[card_id]; + let velocity = if dt.abs() > 0.1 { + card_tx_count[card_id] as f64 / dt.abs().max(0.5) + } else { card_tx_count[card_id] as f64 + 1.0 }; + + // PCA-like features V1-V10 from real distribution shapes + let mut vf = [0.0f64; 10]; + for k in 0..10 { + vf[k] = lcg_normal(&mut rng) * (1.0 / (k as f64 + 1.0).sqrt()); + } + + card_last_hour[card_id] = hour; + card_last_geo[card_id] = home_geo; + card_tx_count[card_id] += 1; + + txs.push(Transaction { + id: i, card_id, amount, hour, merchant_cat, card_present, + geo_distance_km, velocity, v_features: vf, fraud: FraudType::Legit, + }); + } + + // Inject fraud (~0.5% = ~10 transactions across 5 types) + let fraud_types = [ + FraudType::CardNotPresent, FraudType::AccountTakeover, + FraudType::CardClone, FraudType::Synthetic, FraudType::Refund, + ]; + for ft in &fraud_types { + let count = 2; + for _ in 0..count { + let idx = (lcg_next(&mut rng) as usize) % n; + let tx = &mut txs[idx]; + tx.fraud = *ft; + match ft { + FraudType::CardNotPresent => { + tx.card_present = false; + tx.amount = (tx.amount * 3.5 + 200.0).min(4500.0); + tx.hour = 3.0 + lcg_f64(&mut rng) * 3.0; // 3-6 AM + tx.merchant_cat = 3; // online_retail + tx.v_features[0] += 2.5; tx.v_features[1] -= 1.8; + } + FraudType::AccountTakeover => { + tx.velocity = 8.0 + lcg_f64(&mut rng) * 5.0; // burst + tx.amount = 500.0 + lcg_f64(&mut rng) * 2000.0; + tx.v_features[2] += 3.0; tx.v_features[3] -= 2.0; + } + FraudType::CardClone => { + tx.geo_distance_km = 800.0 + lcg_f64(&mut rng) * 5000.0; // impossible travel + tx.card_present = true; + tx.v_features[4] += 2.8; tx.v_features[5] -= 1.5; + } + FraudType::Synthetic => { + tx.amount = 50.0 + lcg_f64(&mut rng) * 150.0; // gradual escalation + tx.merchant_cat = 9; // luxury + tx.v_features[6] += 2.0; tx.v_features[7] += 1.5; + } + FraudType::Refund => { + tx.amount = -(20.0 + lcg_f64(&mut rng) * 80.0); // refund (negative) + tx.v_features[8] -= 2.5; tx.v_features[9] += 1.8; + } + _ => {} + } + } + } + txs +} + +fn extract_embedding(tx: &Transaction, mean_amt: f64, std_amt: f64) -> Vec { + let log_amt = (tx.amount.abs() + 1.0).ln(); + let hour_sin = (tx.hour * std::f64::consts::PI / 12.0).sin(); + let hour_cos = (tx.hour * std::f64::consts::PI / 12.0).cos(); + let mut cat_oh = [0.0f32; 10]; + cat_oh[tx.merchant_cat] = 1.0; + let cp_flag = if tx.card_present { 1.0f32 } else { 0.0 }; + let geo_log = (tx.geo_distance_km + 1.0).ln(); + let vel_log = (tx.velocity + 1.0).ln(); + let amt_zscore = if std_amt > 1e-6 { (tx.amount - mean_amt) / std_amt } else { 0.0 }; + let amt_vel = log_amt * vel_log; + let geo_time = geo_log * (tx.hour / 24.0); + + let mut emb = Vec::with_capacity(DIM); + emb.push(log_amt as f32); // 0 + emb.push(hour_sin as f32); // 1 + emb.push(hour_cos as f32); // 2 + for v in &cat_oh { emb.push(*v); } // 3-12 + emb.push(cp_flag); // 13 + emb.push(geo_log as f32); // 14 + emb.push(vel_log as f32); // 15 + emb.push(amt_zscore as f32); // 16 + for k in 0..10 { emb.push(tx.v_features[k] as f32); } // 17-26 + emb.push(amt_vel as f32); // 27 + emb.push(geo_time as f32); // 28 + emb.push(if tx.amount < 0.0 { 1.0 } else { 0.0 }); // 29 refund flag + emb.push((tx.id as f64 / N_TX as f64) as f32); // 30 temporal position + while emb.len() < DIM { emb.push(0.0); } + emb.truncate(DIM); + emb +} + +fn cosine_sim(a: &[f32], b: &[f32]) -> f64 { + let (mut d, mut na, mut nb) = (0.0f64, 0.0f64, 0.0f64); + for i in 0..a.len().min(b.len()) { + d += a[i] as f64 * b[i] as f64; + na += (a[i] as f64).powi(2); nb += (b[i] as f64).powi(2); + } + let dn = na.sqrt() * nb.sqrt(); + if dn < 1e-15 { 0.0 } else { d / dn } +} + +fn unary_anomaly_score(tx: &Transaction, mean_amt: f64, std_amt: f64) -> f64 { + let amt_dev = ((tx.amount - mean_amt) / std_amt.max(1e-6)).abs(); + let time_anom = if tx.hour < 5.0 || tx.hour > 23.0 { 0.8 } else { 0.0 }; + let vel_spike = if tx.velocity > 5.0 { (tx.velocity - 5.0) * 0.3 } else { 0.0 }; + let geo_imp = if tx.geo_distance_km > 500.0 { + (tx.geo_distance_km / 1000.0).min(2.0) + } else { 0.0 }; + let refund_flag = if tx.amount < 0.0 { 0.6 } else { 0.0 }; + 0.4 * amt_dev + 0.5 * time_anom + 0.6 * vel_spike + 0.8 * geo_imp + refund_flag - 0.7 +} + +struct Edge { from: usize, to: usize, weight: f64 } + +fn build_graph(txs: &[Transaction], embs: &[Vec], + alpha: f64, beta: f64, k: usize) -> Vec { + let m = txs.len(); + let mut edges = Vec::new(); + + // Temporal chain: consecutive transactions by same card + let mut by_card: Vec> = vec![Vec::new(); 200]; + for (i, tx) in txs.iter().enumerate() { by_card[tx.card_id].push(i); } + for card_txs in &by_card { + for w in card_txs.windows(2) { + edges.push(Edge { from: w[0], to: w[1], weight: alpha }); + edges.push(Edge { from: w[1], to: w[0], weight: alpha }); + } + } + + // Merchant-category edges: connect same-merchant transactions (sampled) + let mut by_merchant: Vec> = vec![Vec::new(); 10]; + for (i, tx) in txs.iter().enumerate() { by_merchant[tx.merchant_cat].push(i); } + for mtxs in &by_merchant { + for w in mtxs.windows(2) { + edges.push(Edge { from: w[0], to: w[1], weight: alpha * 0.3 }); + edges.push(Edge { from: w[1], to: w[0], weight: alpha * 0.3 }); + } + } + + // kNN similarity edges from embeddings + for i in 0..m { + let mut sims: Vec<(usize, f64)> = (0..m) + .filter(|&j| j != i) + .map(|j| (j, cosine_sim(&embs[i], &embs[j]).max(0.0))) + .collect(); + sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + for &(j, s) in sims.iter().take(k) { + if s > 0.1 { edges.push(Edge { from: i, to: j, weight: beta * s }); } + } + } + edges +} + +fn solve_mincut(lambdas: &[f64], edges: &[Edge], gamma: f64) -> Vec { + let m = lambdas.len(); + let (s, t, n) = (m, m + 1, m + 2); + let mut adj: Vec> = vec![Vec::new(); n]; + let mut caps: Vec = Vec::new(); + let ae = |adj: &mut Vec>, caps: &mut Vec, + u: usize, v: usize, c: f64| { + let idx = caps.len(); caps.push(c); caps.push(0.0); + adj[u].push((v, idx)); adj[v].push((u, idx + 1)); + }; + for i in 0..m { + let p0 = lambdas[i].max(0.0); + let p1 = (-lambdas[i]).max(0.0); + if p0 > 1e-12 { ae(&mut adj, &mut caps, s, i, p0); } + if p1 > 1e-12 { ae(&mut adj, &mut caps, i, t, p1); } + } + for e in edges { + let c = gamma * e.weight; + if c > 1e-12 { ae(&mut adj, &mut caps, e.from, e.to, c); } + } + loop { + let mut par: Vec> = vec![None; n]; + let mut vis = vec![false; n]; + let mut q = std::collections::VecDeque::new(); + vis[s] = true; q.push_back(s); + while let Some(u) = q.pop_front() { + if u == t { break; } + for &(v, ei) in &adj[u] { + if !vis[v] && caps[ei] > 1e-15 { + vis[v] = true; par[v] = Some((u, ei)); q.push_back(v); + } + } + } + if !vis[t] { break; } + let mut bn = f64::MAX; + let mut v = t; + while let Some((u, ei)) = par[v] { bn = bn.min(caps[ei]); v = u; } + v = t; + while let Some((u, ei)) = par[v] { caps[ei] -= bn; caps[ei ^ 1] += bn; v = u; } + } + let mut reach = vec![false; n]; + let mut stk = vec![s]; reach[s] = true; + while let Some(u) = stk.pop() { + for &(v, ei) in &adj[u] { + if !reach[v] && caps[ei] > 1e-15 { reach[v] = true; stk.push(v); } + } + } + (0..m).map(|i| reach[i]).collect() +} + +fn threshold_detect(txs: &[Transaction], mean_amt: f64, std_amt: f64) -> Vec { + txs.iter().map(|tx| { + let amt_z = ((tx.amount - mean_amt) / std_amt.max(1e-6)).abs(); + amt_z > 3.0 || tx.velocity > 5.0 || tx.geo_distance_km > 500.0 || tx.amount < 0.0 + }).collect() +} + +struct Metrics { precision: f64, recall: f64, f1: f64, fpr: f64 } + +fn evaluate(txs: &[Transaction], preds: &[bool]) -> Metrics { + let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0, 0, 0); + for (tx, &p) in txs.iter().zip(preds) { + let truth = tx.fraud != FraudType::Legit; + match (p, truth) { + (true, true) => tp += 1, (true, false) => fp += 1, + (false, false) => tn += 1, (false, true) => fn_ += 1, + } + } + let precision = if tp + fp > 0 { tp as f64 / (tp + fp) as f64 } else { 0.0 }; + let recall = if tp + fn_ > 0 { tp as f64 / (tp + fn_) as f64 } else { 0.0 }; + let f1 = if precision + recall > 0.0 { 2.0 * precision * recall / (precision + recall) } else { 0.0 }; + let fpr = if tn + fp > 0 { fp as f64 / (tn + fp) as f64 } else { 0.0 }; + Metrics { precision, recall, f1, fpr } +} + +fn per_type_detection(txs: &[Transaction], preds: &[bool]) -> Vec<(&'static str, usize, usize)> { + let types = [ + FraudType::CardNotPresent, FraudType::AccountTakeover, + FraudType::CardClone, FraudType::Synthetic, FraudType::Refund, + ]; + types.iter().map(|ft| { + let total = txs.iter().filter(|tx| tx.fraud == *ft).count(); + let detected = txs.iter().zip(preds).filter(|(tx, &p)| tx.fraud == *ft && p).count(); + (ft.label(), detected, total) + }).collect() +} + +fn amount_bucket(amt: f64) -> &'static str { + let a = amt.abs(); + if a < 25.0 { "micro" } else if a < 100.0 { "small" } + else if a < 500.0 { "medium" } else if a < 2000.0 { "large" } + else { "xlarge" } +} + +fn hex(b: &[u8]) -> String { b.iter().map(|x| format!("{:02x}", x)).collect() } + +fn main() { + println!("=== Financial Fraud Detection via Graph Cut / MRF ===\n"); + + let (alpha, beta, gamma, k_nn) = (0.25, 0.12, 0.35, 3usize); + let seed = 42u64; + + // Generate transactions + let txs = generate_transactions(N_TX, seed); + let n_fraud = txs.iter().filter(|tx| tx.fraud != FraudType::Legit).count(); + let fraud_rate = n_fraud as f64 / N_TX as f64 * 100.0; + println!(" Transactions: {} | Fraud: {} ({:.2}%) | Cards: 200\n", N_TX, n_fraud, fraud_rate); + + // Distribution stats + let amounts: Vec = txs.iter().map(|tx| tx.amount).collect(); + let mean_amt = amounts.iter().sum::() / amounts.len() as f64; + let std_amt = (amounts.iter().map(|a| (a - mean_amt).powi(2)).sum::() + / amounts.len() as f64).sqrt(); + let pos_amounts: Vec = amounts.iter().filter(|&&a| a > 0.0).cloned().collect(); + let mut sorted_pos = pos_amounts.clone(); + sorted_pos.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let median = sorted_pos[sorted_pos.len() / 2]; + println!(" Amount stats: median=${:.0}, mean=${:.0}, std=${:.0}", median, mean_amt, std_amt); + + println!(" Fraud breakdown:"); + for ft in &[FraudType::CardNotPresent, FraudType::AccountTakeover, + FraudType::CardClone, FraudType::Synthetic, FraudType::Refund] { + let c = txs.iter().filter(|tx| tx.fraud == *ft).count(); + if c > 0 { println!(" {}: {}", ft.label(), c); } + } + + // Extract embeddings + let embs: Vec> = txs.iter() + .map(|tx| extract_embedding(tx, mean_amt, std_amt)).collect(); + + // Unary anomaly scores + let lambdas: Vec = txs.iter() + .map(|tx| unary_anomaly_score(tx, mean_amt, std_amt)).collect(); + + // Build graph and solve mincut + println!("\n Building transaction graph..."); + let edges = build_graph(&txs, &embs, alpha, beta, k_nn); + println!(" Graph: {} nodes, {} edges", N_TX, edges.len()); + + let gc_preds = solve_mincut(&lambdas, &edges, gamma); + let gc_flagged = gc_preds.iter().filter(|&&p| p).count(); + + let th_preds = threshold_detect(&txs, mean_amt, std_amt); + let th_flagged = th_preds.iter().filter(|&&p| p).count(); + + // Evaluate + let gc_m = evaluate(&txs, &gc_preds); + let th_m = evaluate(&txs, &th_preds); + + println!("\n {:>12} {:>9} {:>9} {:>9} {:>9} {:>7}", + "Method", "Prec", "Recall", "F1", "FPR", "Flagged"); + println!(" {:->12} {:->9} {:->9} {:->9} {:->9} {:->7}", "", "", "", "", "", ""); + println!(" {:>12} {:>9.3} {:>9.3} {:>9.3} {:>9.4} {:>7}", + "Graph Cut", gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr, gc_flagged); + println!(" {:>12} {:>9.3} {:>9.3} {:>9.3} {:>9.4} {:>7}", + "Threshold", th_m.precision, th_m.recall, th_m.f1, th_m.fpr, th_flagged); + + let f1_imp = if th_m.f1 > 0.0 { (gc_m.f1 - th_m.f1) / th_m.f1 * 100.0 } else { 0.0 }; + let fpr_imp = if th_m.fpr > 0.0 { (th_m.fpr - gc_m.fpr) / th_m.fpr * 100.0 } else { 0.0 }; + println!("\n Graph cut vs threshold: F1 {:+.1}%, FPR reduction {:.1}%", f1_imp, fpr_imp); + + // Per-type detection + println!("\n Per-fraud-type detection:"); + println!(" {:>8} {:>12} {:>12}", "Type", "GraphCut", "Threshold"); + println!(" {:->8} {:->12} {:->12}", "", "", ""); + let gc_types = per_type_detection(&txs, &gc_preds); + let th_types = per_type_detection(&txs, &th_preds); + for (gc_t, th_t) in gc_types.iter().zip(th_types.iter()) { + println!(" {:>8} {:>5}/{:<5} {:>5}/{:<5}", + gc_t.0, gc_t.1, gc_t.2, th_t.1, th_t.2); + } + + // RVF ingestion + println!("\n--- RVF Ingestion ---"); + let tmp = TempDir::new().expect("tmpdir"); + let opts = RvfOptions { + dimension: DIM as u16, metric: DistanceMetric::Cosine, ..Default::default() + }; + let mut store = RvfStore::create(&tmp.path().join("fraud.rvfin"), opts).expect("create"); + + let mut vecs: Vec> = Vec::new(); + let mut ids: Vec = Vec::new(); + let mut meta: Vec = Vec::new(); + + for (i, tx) in txs.iter().enumerate() { + vecs.push(embs[i].clone()); + ids.push(i as u64); + meta.push(MetadataEntry { + field_id: FIELD_MERCHANT, + value: MetadataValue::String(MERCHANTS[tx.merchant_cat].into()), + }); + meta.push(MetadataEntry { + field_id: FIELD_AMOUNT_BUCKET, + value: MetadataValue::String(amount_bucket(tx.amount).into()), + }); + meta.push(MetadataEntry { + field_id: FIELD_FRAUD_TYPE, + value: MetadataValue::String(tx.fraud.label().into()), + }); + } + + let refs: Vec<&[f32]> = vecs.iter().map(|v| v.as_slice()).collect(); + let ing = store.ingest_batch(&refs, &ids, Some(&meta)).expect("ingest"); + println!(" Ingested: {} (rejected: {})", ing.accepted, ing.rejected); + + // Filtered queries: retrieve similar fraud patterns by merchant + println!("\n--- Filtered Queries ---"); + let fraud_idx = txs.iter().position(|tx| tx.fraud != FraudType::Legit).unwrap_or(0); + let qv = &embs[fraud_idx]; + let res = store.query(qv, 10, &QueryOptions::default()).expect("q"); + println!(" Similar to fraud tx #{}: {} results", fraud_idx, res.len()); + for r in res.iter().take(5) { + let tx = &txs[r.id as usize]; + println!(" id={:>5} dist={:.4} merchant={:<15} fraud={}", + r.id, r.distance, MERCHANTS[tx.merchant_cat], tx.fraud.label()); + } + + let fm = FilterExpr::Eq(FIELD_MERCHANT, FilterValue::String("online_retail".into())); + let mr = store.query(qv, 5, &QueryOptions { filter: Some(fm), ..Default::default() }).expect("q"); + println!(" Online-retail only: {} results", mr.len()); + + let ff = FilterExpr::Eq(FIELD_AMOUNT_BUCKET, FilterValue::String("large".into())); + let lr = store.query(qv, 5, &QueryOptions { filter: Some(ff), ..Default::default() }).expect("q"); + println!(" Large-amount only: {} results", lr.len()); + + // Witness chain: audit trail + println!("\n--- Witness Chain (Audit Trail) ---"); + let steps = [ + ("tx_ingest", 0x08u8), ("feature_extract", 0x02), ("graph_build", 0x02), + ("mincut", 0x02), ("classify", 0x02), ("alert", 0x01), ("seal", 0x01), + ]; + let entries: Vec = steps.iter().enumerate().map(|(i, (step, wt))| WitnessEntry { + prev_hash: [0u8; 32], + action_hash: shake256_256(format!("fraud_gc:{}:{}", step, i).as_bytes()), + timestamp_ns: 1_700_000_000_000_000_000 + i as u64 * 1_000_000_000, + witness_type: *wt, + }).collect(); + let chain = create_witness_chain(&entries); + let verified = verify_witness_chain(&chain).expect("verify"); + println!(" {} entries, {} bytes, VALID", verified.len(), chain.len()); + for (i, (step, _)) in steps.iter().enumerate() { + let wn = match verified[i].witness_type { + 0x01 => "PROV", 0x02 => "COMP", 0x08 => "DATA", _ => "????", + }; + println!(" [{:>4}] {:>2} -> {}", wn, i, step); + } + + // Lineage + println!("\n--- Lineage ---"); + let child = store.derive( + &tmp.path().join("fraud_report.rvfin"), DerivationType::Filter, None, + ).expect("derive"); + println!(" parent: {} -> child: {} (depth {})", + hex(store.file_id()), hex(child.parent_id()), child.lineage_depth()); + child.close().expect("close"); + + // Summary + println!("\n=== Summary ==="); + println!(" {} transactions, {} fraud ({:.2}%)", N_TX, n_fraud, fraud_rate); + println!(" Graph cut: prec={:.3} recall={:.3} F1={:.3} FPR={:.4}", + gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr); + println!(" Threshold: prec={:.3} recall={:.3} F1={:.3} FPR={:.4}", + th_m.precision, th_m.recall, th_m.f1, th_m.fpr); + println!(" RVF: {} embeddings | Witness: {} steps", ing.accepted, verified.len()); + println!(" alpha={:.2} beta={:.2} gamma={:.2} k={}", alpha, beta, gamma, k_nn); + store.close().expect("close"); + println!("\nDone."); +}