mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-23 04:27:11 +00:00
feat: add fintech, cybersecurity, and climate graph-cut examples
- Financial fraud: credit card fraud detection with 5 attack types (card-not-present, account takeover, card clone, synthetic, refund), log-normal transaction amounts, temporal chain + merchant edges - Cybersecurity: network threat detection with 6 attack types (port scan, brute force, exfiltration, C2 beacon, DDoS, lateral movement), flow-level features, source/destination graph edges - Climate: environmental anomaly detection on 30x40 station grid with 6 event types (heat wave, pollution spike, drought, ocean warming, cold snap, sensor fault), spatial adjacency + gradient weighted edges All examples use Edmonds-Karp mincut, RVF witness chains, filtered queries, and lineage derivation. https://claude.ai/code/session_01UWE22wnsZRSHKhT4h4Axby
This commit is contained in:
parent
028f7e2cdb
commit
7fa36fc834
4 changed files with 1720 additions and 0 deletions
|
|
@ -269,6 +269,18 @@ path = "examples/genomic_graphcut.rs"
|
|||
name = "supply_chain_graphcut"
|
||||
path = "examples/supply_chain_graphcut.rs"
|
||||
|
||||
[[example]]
|
||||
name = "financial_fraud_graphcut"
|
||||
path = "examples/financial_fraud_graphcut.rs"
|
||||
|
||||
[[example]]
|
||||
name = "openfang"
|
||||
path = "examples/openfang.rs"
|
||||
|
||||
[[example]]
|
||||
name = "cyber_threat_graphcut"
|
||||
path = "examples/cyber_threat_graphcut.rs"
|
||||
|
||||
[[example]]
|
||||
name = "climate_graphcut"
|
||||
path = "examples/climate_graphcut.rs"
|
||||
|
|
|
|||
574
examples/rvf/examples/climate_graphcut.rs
Normal file
574
examples/rvf/examples/climate_graphcut.rs
Normal file
|
|
@ -0,0 +1,574 @@
|
|||
//! Climate Anomaly Detection via Graph Cut / MRF + RuVector
|
||||
//!
|
||||
//! Applies MRF/mincut optimization to environmental monitoring data:
|
||||
//! 1. Generate synthetic station grid (30x40 = 1200 monitoring stations)
|
||||
//! 2. Inject anomalies: heat waves, pollution spikes, drought, ocean warming,
|
||||
//! cold snaps, sensor faults
|
||||
//! 3. Extract 32-dim embeddings, build spatial+similarity graph, solve mincut
|
||||
//! 4. Store station embeddings in RVF with climate zone metadata
|
||||
//! 5. Evaluate: precision, recall, F1, per-event detection rates
|
||||
//!
|
||||
//! Run: cargo run --example climate_graphcut --release
|
||||
|
||||
use rvf_runtime::{FilterExpr, MetadataEntry, MetadataValue, QueryOptions, RvfOptions, RvfStore};
|
||||
use rvf_runtime::filter::FilterValue;
|
||||
use rvf_runtime::options::DistanceMetric;
|
||||
use rvf_types::DerivationType;
|
||||
use rvf_crypto::{create_witness_chain, verify_witness_chain, shake256_256, WitnessEntry};
|
||||
use tempfile::TempDir;
|
||||
|
||||
const ROWS: usize = 30;
|
||||
const COLS: usize = 40;
|
||||
const N_STATIONS: usize = ROWS * COLS;
|
||||
const DIM: usize = 32;
|
||||
|
||||
const FIELD_REGION: u16 = 0;
|
||||
const FIELD_ELEV_CLASS: u16 = 1;
|
||||
const FIELD_CLIMATE_ZONE: u16 = 2;
|
||||
const FIELD_ANOMALY_TYPE: u16 = 3;
|
||||
|
||||
fn lcg_next(s: &mut u64) -> u64 {
|
||||
*s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); *s
|
||||
}
|
||||
fn lcg_f64(s: &mut u64) -> f64 { lcg_next(s); (*s >> 11) as f64 / ((1u64 << 53) as f64) }
|
||||
fn lcg_normal(s: &mut u64) -> f64 {
|
||||
let u1 = lcg_f64(s).max(1e-15); let u2 = lcg_f64(s);
|
||||
(-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
enum AnomalyType { Normal, HeatWave, PollutionSpike, Drought, OceanWarming, ColdSnap, SensorFault }
|
||||
impl AnomalyType {
|
||||
fn label(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Normal => "normal", Self::HeatWave => "heat_wave",
|
||||
Self::PollutionSpike => "pollution", Self::Drought => "drought",
|
||||
Self::OceanWarming => "ocean_warm", Self::ColdSnap => "cold_snap",
|
||||
Self::SensorFault => "sensor_fault",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct Station {
|
||||
row: usize, col: usize,
|
||||
lat: f64, lon: f64, elevation: f64,
|
||||
temperature: f64, humidity: f64, precipitation: f64,
|
||||
wind_speed: f64, aqi: f64, co2: f64,
|
||||
sst: f64, ndvi: f64, day_of_year: f64,
|
||||
is_coastal: bool, truth: AnomalyType,
|
||||
}
|
||||
|
||||
fn climate_zone(lat: f64) -> &'static str {
|
||||
let a = lat.abs();
|
||||
if a > 60.0 { "polar" } else if a > 40.0 { "temperate" } else if a > 23.5 { "subtropical" }
|
||||
else { "tropical" }
|
||||
}
|
||||
|
||||
fn elev_class(e: f64) -> &'static str {
|
||||
if e < 200.0 { "lowland" } else if e < 1000.0 { "highland" } else { "mountain" }
|
||||
}
|
||||
|
||||
fn region_label(row: usize, col: usize) -> &'static str {
|
||||
match (row < ROWS / 2, col < COLS / 2) {
|
||||
(true, true) => "NW", (true, false) => "NE",
|
||||
(false, true) => "SW", _ => "SE",
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_stations(seed: u64, day: f64) -> Vec<Station> {
|
||||
let mut rng = seed;
|
||||
let mut stations = Vec::with_capacity(N_STATIONS);
|
||||
let season = (day / 365.0 * 2.0 * std::f64::consts::PI).sin();
|
||||
let season_cos = (day / 365.0 * 2.0 * std::f64::consts::PI).cos();
|
||||
|
||||
// Pre-generate anomaly clusters (spatial events)
|
||||
let n_heat = 2; let n_poll = 2; let n_drought = 1; let n_cold = 1;
|
||||
let mut heat_centers = Vec::new();
|
||||
for _ in 0..n_heat {
|
||||
heat_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64,
|
||||
3.0 + lcg_f64(&mut rng) * 4.0));
|
||||
}
|
||||
let mut poll_centers = Vec::new();
|
||||
for _ in 0..n_poll {
|
||||
poll_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64,
|
||||
2.0 + lcg_f64(&mut rng) * 3.0));
|
||||
}
|
||||
let mut drought_centers = Vec::new();
|
||||
for _ in 0..n_drought {
|
||||
drought_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64,
|
||||
4.0 + lcg_f64(&mut rng) * 5.0));
|
||||
}
|
||||
let mut cold_centers = Vec::new();
|
||||
for _ in 0..n_cold {
|
||||
cold_centers.push((lcg_f64(&mut rng) * ROWS as f64, lcg_f64(&mut rng) * COLS as f64,
|
||||
3.0 + lcg_f64(&mut rng) * 3.0));
|
||||
}
|
||||
|
||||
for r in 0..ROWS {
|
||||
for c in 0..COLS {
|
||||
let lat = 25.0 + (r as f64 / ROWS as f64) * 50.0; // 25N to 75N
|
||||
let lon = -125.0 + (c as f64 / COLS as f64) * 60.0; // 125W to 65W
|
||||
let elevation = (200.0 + lcg_normal(&mut rng) * 300.0
|
||||
+ 800.0 * ((r as f64 * 0.2).sin() * (c as f64 * 0.15).cos()).abs()).max(0.0);
|
||||
let is_coastal = c < 3 || c >= COLS - 3 || r < 2 || r >= ROWS - 2;
|
||||
|
||||
// Seasonal baseline temperature: warmer at lower latitudes, summer peak
|
||||
let t_base = 30.0 - 0.6 * (lat - 25.0) + 12.0 * season - elevation * 0.006;
|
||||
let temperature = t_base + lcg_normal(&mut rng) * 3.0;
|
||||
let humidity = (60.0 + 15.0 * season_cos + lcg_normal(&mut rng) * 10.0).clamp(10.0, 100.0);
|
||||
let precipitation = (20.0 + 30.0 * (0.5 + 0.5 * season_cos)
|
||||
+ lcg_normal(&mut rng) * 15.0).max(0.0);
|
||||
let wind_speed = (5.0 + lcg_normal(&mut rng) * 3.0).max(0.5);
|
||||
let aqi = (35.0 + lcg_normal(&mut rng) * 15.0).clamp(0.0, 500.0);
|
||||
let co2 = 420.0 + lcg_normal(&mut rng) * 5.0;
|
||||
let sst = if is_coastal {
|
||||
15.0 + 5.0 * season - 0.2 * (lat - 40.0) + lcg_normal(&mut rng) * 1.5
|
||||
} else { 0.0 };
|
||||
let ndvi = (0.5 + 0.15 * season + lcg_normal(&mut rng) * 0.1
|
||||
- elevation * 0.0001).clamp(0.0, 1.0);
|
||||
|
||||
let mut truth = AnomalyType::Normal;
|
||||
|
||||
// Check spatial anomaly clusters
|
||||
for &(cr, cc, rad) in &heat_centers {
|
||||
let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt();
|
||||
if d <= rad { truth = AnomalyType::HeatWave; }
|
||||
}
|
||||
for &(cr, cc, rad) in &poll_centers {
|
||||
let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt();
|
||||
if d <= rad { truth = AnomalyType::PollutionSpike; }
|
||||
}
|
||||
for &(cr, cc, rad) in &drought_centers {
|
||||
let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt();
|
||||
if d <= rad { truth = AnomalyType::Drought; }
|
||||
}
|
||||
for &(cr, cc, rad) in &cold_centers {
|
||||
let d = ((r as f64 - cr).powi(2) + (c as f64 - cc).powi(2)).sqrt();
|
||||
if d <= rad { truth = AnomalyType::ColdSnap; }
|
||||
}
|
||||
// Ocean warming: coastal belt anomaly
|
||||
if truth == AnomalyType::Normal && is_coastal && lat < 45.0 && lcg_f64(&mut rng) < 0.3 {
|
||||
truth = AnomalyType::OceanWarming;
|
||||
}
|
||||
// Sensor fault: rare random
|
||||
if truth == AnomalyType::Normal && lcg_f64(&mut rng) < 0.005 {
|
||||
truth = AnomalyType::SensorFault;
|
||||
}
|
||||
|
||||
// Apply anomaly effects
|
||||
let (temperature, humidity, precipitation, aqi, co2, sst, ndvi) = match truth {
|
||||
AnomalyType::HeatWave => (
|
||||
temperature + 8.0 + lcg_normal(&mut rng) * 2.0,
|
||||
(humidity - 20.0).max(10.0), (precipitation * 0.2).max(0.0),
|
||||
aqi + 30.0, co2 + 10.0, sst, (ndvi - 0.15).max(0.0)),
|
||||
AnomalyType::PollutionSpike => (
|
||||
temperature, humidity,precipitation,
|
||||
(180.0 + lcg_f64(&mut rng) * 200.0).min(500.0),
|
||||
co2 + 25.0 + lcg_normal(&mut rng) * 10.0, sst, ndvi),
|
||||
AnomalyType::Drought => (
|
||||
temperature + 4.0, (humidity - 30.0).max(10.0),
|
||||
(precipitation * 0.05).max(0.0), aqi + 15.0, co2,
|
||||
sst, (ndvi - 0.25).max(0.0)),
|
||||
AnomalyType::OceanWarming => (
|
||||
temperature + 2.0, humidity, precipitation, aqi, co2,
|
||||
sst + 3.0 + lcg_normal(&mut rng) * 0.5, ndvi),
|
||||
AnomalyType::ColdSnap => (
|
||||
temperature - 15.0 - lcg_f64(&mut rng) * 5.0,
|
||||
humidity + 10.0, precipitation + 5.0,
|
||||
aqi, co2, sst, (ndvi - 0.1).max(0.0)),
|
||||
AnomalyType::SensorFault => (
|
||||
-80.0 + lcg_f64(&mut rng) * 160.0,
|
||||
lcg_f64(&mut rng) * 200.0,
|
||||
lcg_f64(&mut rng) * 500.0,
|
||||
lcg_f64(&mut rng) * 500.0,
|
||||
lcg_f64(&mut rng) * 1000.0,
|
||||
if is_coastal { lcg_f64(&mut rng) * 50.0 } else { 0.0 },
|
||||
lcg_f64(&mut rng)),
|
||||
AnomalyType::Normal => (temperature, humidity, precipitation, aqi, co2, sst, ndvi),
|
||||
};
|
||||
|
||||
stations.push(Station {
|
||||
row: r, col: c, lat, lon, elevation, temperature, humidity,
|
||||
precipitation, wind_speed, aqi, co2, sst, ndvi, day_of_year: day,
|
||||
is_coastal, truth,
|
||||
});
|
||||
}
|
||||
}
|
||||
stations
|
||||
}
|
||||
|
||||
fn extract_embedding(st: &Station, stats: &StationStats) -> Vec<f32> {
|
||||
let t_anom = (st.temperature - stats.t_mean) / stats.t_std.max(1e-6);
|
||||
let h_z = (st.humidity - stats.h_mean) / stats.h_std.max(1e-6);
|
||||
let p_z = (st.precipitation - stats.p_mean) / stats.p_std.max(1e-6);
|
||||
let w_z = (st.wind_speed - stats.w_mean) / stats.w_std.max(1e-6);
|
||||
let aqi_log = (st.aqi + 1.0).ln();
|
||||
let co2_dev = (st.co2 - 420.0) / 20.0;
|
||||
let sst_anom = if st.is_coastal { (st.sst - stats.sst_mean) / stats.sst_std.max(1e-6) } else { 0.0 };
|
||||
let ndvi_dev = (st.ndvi - stats.ndvi_mean) / stats.ndvi_std.max(1e-6);
|
||||
let lat_sin = (st.lat * std::f64::consts::PI / 180.0).sin();
|
||||
let lat_cos = (st.lat * std::f64::consts::PI / 180.0).cos();
|
||||
let lon_sin = (st.lon * std::f64::consts::PI / 180.0).sin();
|
||||
let lon_cos = (st.lon * std::f64::consts::PI / 180.0).cos();
|
||||
let elev_log = (st.elevation + 1.0).ln() / 8.0;
|
||||
let season_sin = (st.day_of_year / 365.0 * 2.0 * std::f64::consts::PI).sin();
|
||||
let season_cos = (st.day_of_year / 365.0 * 2.0 * std::f64::consts::PI).cos();
|
||||
|
||||
vec![
|
||||
t_anom as f32, h_z as f32, p_z as f32, w_z as f32, // 0-3: z-scores
|
||||
aqi_log as f32, co2_dev as f32, sst_anom as f32, ndvi_dev as f32, // 4-7: derived
|
||||
lat_sin as f32, lat_cos as f32, lon_sin as f32, lon_cos as f32, // 8-11: position
|
||||
elev_log as f32, season_sin as f32, season_cos as f32, // 12-14: context
|
||||
(t_anom.abs() + h_z.abs()) as f32, // 15: temp+humidity
|
||||
(t_anom * p_z) as f32, // 16: temp*precip
|
||||
(aqi_log * co2_dev) as f32, // 17: aqi*co2
|
||||
(t_anom * ndvi_dev) as f32, // 18: temp*ndvi
|
||||
if st.aqi > 150.0 { 1.0 } else { 0.0 }, // 19: pollution flag
|
||||
if t_anom > 2.0 { 1.0 } else if t_anom < -2.0 { -1.0 } else { 0.0 }, // 20: temp flag
|
||||
(t_anom * t_anom) as f32, // 21: t^2
|
||||
(h_z * h_z) as f32, // 22: h^2
|
||||
(p_z * p_z) as f32, // 23: p^2
|
||||
(sst_anom * sst_anom) as f32, // 24: sst^2
|
||||
(t_anom.abs() * aqi_log) as f32, // 25: |t|*aqi
|
||||
(p_z * ndvi_dev) as f32, // 26: precip*ndvi
|
||||
(co2_dev * t_anom) as f32, // 27: co2*t
|
||||
if st.is_coastal { 1.0 } else { 0.0 }, // 28: coastal flag
|
||||
(st.temperature / 50.0).clamp(-1.0, 1.0) as f32, // 29: raw temp norm
|
||||
(st.aqi / 500.0) as f32, // 30: raw aqi norm
|
||||
(st.ndvi) as f32, // 31: raw ndvi
|
||||
]
|
||||
}
|
||||
|
||||
struct StationStats {
|
||||
t_mean: f64, t_std: f64, h_mean: f64, h_std: f64,
|
||||
p_mean: f64, p_std: f64, w_mean: f64, w_std: f64,
|
||||
sst_mean: f64, sst_std: f64, ndvi_mean: f64, ndvi_std: f64,
|
||||
}
|
||||
|
||||
fn compute_stats(stations: &[Station]) -> StationStats {
|
||||
let n = stations.len() as f64;
|
||||
let mean = |f: &dyn Fn(&Station) -> f64| stations.iter().map(f).sum::<f64>() / n;
|
||||
let std = |f: &dyn Fn(&Station) -> f64, m: f64|
|
||||
(stations.iter().map(|s| (f(s) - m).powi(2)).sum::<f64>() / n).sqrt();
|
||||
let tm = mean(&|s| s.temperature); let hm = mean(&|s| s.humidity);
|
||||
let pm = mean(&|s| s.precipitation); let wm = mean(&|s| s.wind_speed);
|
||||
let coastal: Vec<&Station> = stations.iter().filter(|s| s.is_coastal).collect();
|
||||
let cn = coastal.len().max(1) as f64;
|
||||
let sm = coastal.iter().map(|s| s.sst).sum::<f64>() / cn;
|
||||
let ss = (coastal.iter().map(|s| (s.sst - sm).powi(2)).sum::<f64>() / cn).sqrt();
|
||||
let nm = mean(&|s| s.ndvi);
|
||||
StationStats {
|
||||
t_mean: tm, t_std: std(&|s| s.temperature, tm),
|
||||
h_mean: hm, h_std: std(&|s| s.humidity, hm),
|
||||
p_mean: pm, p_std: std(&|s| s.precipitation, pm),
|
||||
w_mean: wm, w_std: std(&|s| s.wind_speed, wm),
|
||||
sst_mean: sm, sst_std: ss,
|
||||
ndvi_mean: nm, ndvi_std: std(&|s| s.ndvi, nm),
|
||||
}
|
||||
}
|
||||
|
||||
fn unary_score(st: &Station, stats: &StationStats) -> f64 {
|
||||
let t_z = ((st.temperature - stats.t_mean) / stats.t_std.max(1e-6)).abs();
|
||||
let h_z = ((st.humidity - stats.h_mean) / stats.h_std.max(1e-6)).abs();
|
||||
let p_z = ((st.precipitation - stats.p_mean) / stats.p_std.max(1e-6)).abs();
|
||||
let aqi_f = if st.aqi > 100.0 { (st.aqi - 100.0) / 100.0 } else { 0.0 };
|
||||
let co2_f = ((st.co2 - 420.0).abs() / 20.0).max(0.0);
|
||||
let sst_z = if st.is_coastal {
|
||||
((st.sst - stats.sst_mean) / stats.sst_std.max(1e-6)).abs()
|
||||
} else { 0.0 };
|
||||
let ndvi_z = ((st.ndvi - stats.ndvi_mean) / stats.ndvi_std.max(1e-6)).abs();
|
||||
0.3 * t_z + 0.15 * h_z + 0.1 * p_z + 0.2 * aqi_f + 0.1 * co2_f
|
||||
+ 0.15 * sst_z + 0.1 * ndvi_z - 1.2
|
||||
}
|
||||
|
||||
fn cosine_sim(a: &[f32], b: &[f32]) -> f64 {
|
||||
let (mut d, mut na, mut nb) = (0.0f64, 0.0f64, 0.0f64);
|
||||
for i in 0..a.len().min(b.len()) {
|
||||
d += a[i] as f64 * b[i] as f64;
|
||||
na += (a[i] as f64).powi(2); nb += (b[i] as f64).powi(2);
|
||||
}
|
||||
let dn = na.sqrt() * nb.sqrt();
|
||||
if dn < 1e-15 { 0.0 } else { d / dn }
|
||||
}
|
||||
|
||||
struct Edge { from: usize, to: usize, weight: f64 }
|
||||
|
||||
fn build_graph(stations: &[Station], embs: &[Vec<f32>], alpha: f64, beta: f64, k: usize) -> Vec<Edge> {
|
||||
let mut edges = Vec::new();
|
||||
// 4-connected spatial adjacency
|
||||
for r in 0..ROWS { for c in 0..COLS {
|
||||
let idx = r * COLS + c;
|
||||
for &(dr, dc) in &[(0i32, 1i32), (1, 0)] {
|
||||
let (nr, nc) = (r as i32 + dr, c as i32 + dc);
|
||||
if nr >= ROWS as i32 || nc >= COLS as i32 { continue; }
|
||||
let nidx = nr as usize * COLS + nc as usize;
|
||||
let dist = ((stations[idx].lat - stations[nidx].lat).powi(2)
|
||||
+ (stations[idx].lon - stations[nidx].lon).powi(2)).sqrt();
|
||||
let grad = (stations[idx].temperature - stations[nidx].temperature).abs()
|
||||
+ (stations[idx].aqi - stations[nidx].aqi).abs() * 0.01;
|
||||
let w = alpha * (1.0 / (1.0 + dist)) * (-grad * 0.05).exp();
|
||||
edges.push(Edge { from: idx, to: nidx, weight: w });
|
||||
edges.push(Edge { from: nidx, to: idx, weight: w });
|
||||
}
|
||||
}}
|
||||
// kNN similarity edges from embeddings
|
||||
for i in 0..embs.len() {
|
||||
let mut sims: Vec<(usize, f64)> = (0..embs.len())
|
||||
.filter(|&j| {
|
||||
let (ri, ci) = (i / COLS, i % COLS);
|
||||
let (rj, cj) = (j / COLS, j % COLS);
|
||||
(ri as isize - rj as isize).unsigned_abs() + (ci as isize - cj as isize).unsigned_abs() > 2
|
||||
})
|
||||
.map(|j| (j, cosine_sim(&embs[i], &embs[j]).max(0.0))).collect();
|
||||
sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
||||
for &(j, s) in sims.iter().take(k) {
|
||||
if s > 0.1 { edges.push(Edge { from: i, to: j, weight: beta * s }); }
|
||||
}
|
||||
}
|
||||
edges
|
||||
}
|
||||
|
||||
fn solve_mincut(lambdas: &[f64], edges: &[Edge], gamma: f64) -> Vec<bool> {
|
||||
let m = lambdas.len();
|
||||
let (s, t, n) = (m, m + 1, m + 2);
|
||||
let mut adj: Vec<Vec<(usize, usize)>> = vec![Vec::new(); n];
|
||||
let mut caps: Vec<f64> = Vec::new();
|
||||
let ae = |adj: &mut Vec<Vec<(usize, usize)>>, caps: &mut Vec<f64>, u: usize, v: usize, c: f64| {
|
||||
let idx = caps.len(); caps.push(c); caps.push(0.0);
|
||||
adj[u].push((v, idx)); adj[v].push((u, idx + 1));
|
||||
};
|
||||
for i in 0..m {
|
||||
let p0 = lambdas[i].max(0.0); let p1 = (-lambdas[i]).max(0.0);
|
||||
if p0 > 1e-12 { ae(&mut adj, &mut caps, s, i, p0); }
|
||||
if p1 > 1e-12 { ae(&mut adj, &mut caps, i, t, p1); }
|
||||
}
|
||||
for e in edges {
|
||||
let c = gamma * e.weight;
|
||||
if c > 1e-12 { ae(&mut adj, &mut caps, e.from, e.to, c); }
|
||||
}
|
||||
loop {
|
||||
let mut par: Vec<Option<(usize, usize)>> = vec![None; n];
|
||||
let mut vis = vec![false; n];
|
||||
let mut q = std::collections::VecDeque::new();
|
||||
vis[s] = true; q.push_back(s);
|
||||
while let Some(u) = q.pop_front() {
|
||||
if u == t { break; }
|
||||
for &(v, ei) in &adj[u] {
|
||||
if !vis[v] && caps[ei] > 1e-15 { vis[v] = true; par[v] = Some((u, ei)); q.push_back(v); }
|
||||
}
|
||||
}
|
||||
if !vis[t] { break; }
|
||||
let mut bn = f64::MAX;
|
||||
let mut v = t;
|
||||
while let Some((u, ei)) = par[v] { bn = bn.min(caps[ei]); v = u; }
|
||||
v = t;
|
||||
while let Some((u, ei)) = par[v] { caps[ei] -= bn; caps[ei ^ 1] += bn; v = u; }
|
||||
}
|
||||
let mut reach = vec![false; n];
|
||||
let mut stk = vec![s]; reach[s] = true;
|
||||
while let Some(u) = stk.pop() {
|
||||
for &(v, ei) in &adj[u] {
|
||||
if !reach[v] && caps[ei] > 1e-15 { reach[v] = true; stk.push(v); }
|
||||
}
|
||||
}
|
||||
(0..m).map(|i| reach[i]).collect()
|
||||
}
|
||||
|
||||
fn threshold_detect(stations: &[Station], stats: &StationStats) -> Vec<bool> {
|
||||
stations.iter().map(|st| {
|
||||
let t_z = ((st.temperature - stats.t_mean) / stats.t_std.max(1e-6)).abs();
|
||||
let aqi_hi = st.aqi > 150.0;
|
||||
let co2_hi = (st.co2 - 420.0).abs() > 40.0;
|
||||
let ndvi_lo = st.ndvi < stats.ndvi_mean - 2.5 * stats.ndvi_std;
|
||||
t_z > 3.0 || aqi_hi || co2_hi || ndvi_lo
|
||||
}).collect()
|
||||
}
|
||||
|
||||
struct Metrics { precision: f64, recall: f64, f1: f64, fpr: f64 }
|
||||
|
||||
fn evaluate(stations: &[Station], pred: &[bool]) -> Metrics {
|
||||
let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0, 0, 0);
|
||||
for (i, st) in stations.iter().enumerate() {
|
||||
let truth = st.truth != AnomalyType::Normal;
|
||||
match (truth, pred[i]) {
|
||||
(true, true) => tp += 1, (false, true) => fp += 1,
|
||||
(true, false) => fn_ += 1, (false, false) => tn += 1,
|
||||
}
|
||||
}
|
||||
let prec = if tp + fp > 0 { tp as f64 / (tp + fp) as f64 } else { 0.0 };
|
||||
let rec = if tp + fn_ > 0 { tp as f64 / (tp + fn_) as f64 } else { 0.0 };
|
||||
let f1 = if prec + rec > 0.0 { 2.0 * prec * rec / (prec + rec) } else { 0.0 };
|
||||
let fpr = if tn + fp > 0 { fp as f64 / (tn + fp) as f64 } else { 0.0 };
|
||||
Metrics { precision: prec, recall: rec, f1, fpr }
|
||||
}
|
||||
|
||||
fn per_event_detection(stations: &[Station], pred: &[bool]) -> Vec<(&'static str, usize, usize)> {
|
||||
let types = [
|
||||
AnomalyType::HeatWave, AnomalyType::PollutionSpike, AnomalyType::Drought,
|
||||
AnomalyType::OceanWarming, AnomalyType::ColdSnap, AnomalyType::SensorFault,
|
||||
];
|
||||
types.iter().map(|&at| {
|
||||
let total = stations.iter().filter(|s| s.truth == at).count();
|
||||
let detected = stations.iter().enumerate()
|
||||
.filter(|(i, s)| s.truth == at && pred[*i]).count();
|
||||
(at.label(), detected, total)
|
||||
}).collect()
|
||||
}
|
||||
|
||||
fn hex(b: &[u8]) -> String { b.iter().map(|x| format!("{:02x}", x)).collect() }
|
||||
|
||||
fn main() {
|
||||
println!("=== Climate Anomaly Detection via Graph Cut / MRF ===\n");
|
||||
let (alpha, beta, gamma, k_nn) = (0.25, 0.12, 0.5, 3usize);
|
||||
let days = [172.0, 355.0]; // summer solstice, late December
|
||||
let day_labels = ["Summer", "Winter"];
|
||||
|
||||
let tmp = TempDir::new().expect("tmpdir");
|
||||
let opts = RvfOptions { dimension: DIM as u16, metric: DistanceMetric::Cosine, ..Default::default() };
|
||||
let mut store = RvfStore::create(&tmp.path().join("climate.rvenv"), opts).expect("create");
|
||||
|
||||
println!(" Grid: {}x{} = {} stations | Dim: {} | alpha={} beta={} gamma={} k={}\n",
|
||||
ROWS, COLS, N_STATIONS, DIM, alpha, beta, gamma, k_nn);
|
||||
|
||||
let (mut all_vecs, mut all_ids, mut all_meta): (Vec<Vec<f32>>, Vec<u64>, Vec<MetadataEntry>) =
|
||||
(Vec::new(), Vec::new(), Vec::new());
|
||||
|
||||
for (si, &day) in days.iter().enumerate() {
|
||||
let seed = 42 + si as u64 * 7919;
|
||||
let stations = generate_stations(seed, day);
|
||||
let n_anom = stations.iter().filter(|s| s.truth != AnomalyType::Normal).count();
|
||||
println!("--- Season: {} (day {}) ---", day_labels[si], day as u32);
|
||||
println!(" {} stations, {} anomalous ({:.1}%)", N_STATIONS, n_anom,
|
||||
n_anom as f64 / N_STATIONS as f64 * 100.0);
|
||||
for at in &[AnomalyType::HeatWave, AnomalyType::PollutionSpike, AnomalyType::Drought,
|
||||
AnomalyType::OceanWarming, AnomalyType::ColdSnap, AnomalyType::SensorFault] {
|
||||
let c = stations.iter().filter(|s| s.truth == *at).count();
|
||||
if c > 0 { println!(" {}: {}", at.label(), c); }
|
||||
}
|
||||
|
||||
let stats = compute_stats(&stations);
|
||||
let embs: Vec<Vec<f32>> = stations.iter().map(|s| extract_embedding(s, &stats)).collect();
|
||||
let lam: Vec<f64> = stations.iter().map(|s| unary_score(s, &stats)).collect();
|
||||
let edges = build_graph(&stations, &embs, alpha, beta, k_nn);
|
||||
let gc = solve_mincut(&lam, &edges, gamma);
|
||||
let tc = threshold_detect(&stations, &stats);
|
||||
|
||||
let gc_m = evaluate(&stations, &gc);
|
||||
let tc_m = evaluate(&stations, &tc);
|
||||
println!("\n {:>12} {:>8} {:>8} {:>8} {:>8}", "Method", "Prec", "Recall", "F1", "FPR");
|
||||
println!(" {:->12} {:->8} {:->8} {:->8} {:->8}", "", "", "", "", "");
|
||||
println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}",
|
||||
"Graph Cut", gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr);
|
||||
println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}",
|
||||
"Threshold", tc_m.precision, tc_m.recall, tc_m.f1, tc_m.fpr);
|
||||
|
||||
println!("\n Per-event detection (Graph Cut):");
|
||||
println!(" {:>14} {:>6} {:>6} {:>8}", "Event", "Det", "Total", "Rate");
|
||||
println!(" {:->14} {:->6} {:->6} {:->8}", "", "", "", "");
|
||||
for (label, det, total) in per_event_detection(&stations, &gc) {
|
||||
if total > 0 {
|
||||
println!(" {:>14} {:>6} {:>6} {:>7.1}%", label, det, total,
|
||||
det as f64 / total as f64 * 100.0);
|
||||
}
|
||||
}
|
||||
|
||||
// Spatial coherence: count contiguous anomaly regions in graph cut
|
||||
let mut visited = vec![false; N_STATIONS];
|
||||
let mut n_regions = 0usize;
|
||||
for i in 0..N_STATIONS {
|
||||
if gc[i] && !visited[i] {
|
||||
n_regions += 1;
|
||||
let mut stk = vec![i];
|
||||
while let Some(u) = stk.pop() {
|
||||
if visited[u] { continue; }
|
||||
visited[u] = true;
|
||||
let (ur, uc) = (u / COLS, u % COLS);
|
||||
for &(dr, dc) in &[(-1i32, 0i32), (1, 0), (0, -1), (0, 1)] {
|
||||
let (nr, nc) = (ur as i32 + dr, uc as i32 + dc);
|
||||
if nr >= 0 && nr < ROWS as i32 && nc >= 0 && nc < COLS as i32 {
|
||||
let ni = nr as usize * COLS + nc as usize;
|
||||
if gc[ni] && !visited[ni] { stk.push(ni); }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let gc_count = gc.iter().filter(|&&x| x).count();
|
||||
println!("\n Spatial coherence: {} anomaly regions ({} flagged stations)",
|
||||
n_regions, gc_count);
|
||||
|
||||
// Store embeddings with metadata
|
||||
for (i, emb) in embs.iter().enumerate() {
|
||||
let id = si as u64 * 100_000 + i as u64;
|
||||
all_vecs.push(emb.clone()); all_ids.push(id);
|
||||
all_meta.push(MetadataEntry { field_id: FIELD_REGION,
|
||||
value: MetadataValue::String(region_label(stations[i].row, stations[i].col).into()) });
|
||||
all_meta.push(MetadataEntry { field_id: FIELD_ELEV_CLASS,
|
||||
value: MetadataValue::String(elev_class(stations[i].elevation).into()) });
|
||||
all_meta.push(MetadataEntry { field_id: FIELD_CLIMATE_ZONE,
|
||||
value: MetadataValue::String(climate_zone(stations[i].lat).into()) });
|
||||
all_meta.push(MetadataEntry { field_id: FIELD_ANOMALY_TYPE,
|
||||
value: MetadataValue::String(stations[i].truth.label().into()) });
|
||||
}
|
||||
println!();
|
||||
}
|
||||
|
||||
// RVF ingestion
|
||||
println!("--- RVF Ingestion ---");
|
||||
let refs: Vec<&[f32]> = all_vecs.iter().map(|v| v.as_slice()).collect();
|
||||
let ing = store.ingest_batch(&refs, &all_ids, Some(&all_meta)).expect("ingest");
|
||||
println!(" Ingested: {} embeddings (rejected: {})", ing.accepted, ing.rejected);
|
||||
|
||||
// Filtered queries
|
||||
println!("\n--- RVF Queries ---");
|
||||
let qv = all_vecs[0].clone();
|
||||
for zone in &["polar", "temperate", "subtropical", "tropical"] {
|
||||
let f = FilterExpr::Eq(FIELD_CLIMATE_ZONE, FilterValue::String(zone.to_string()));
|
||||
let r = store.query(&qv, 5, &QueryOptions { filter: Some(f), ..Default::default() }).expect("q");
|
||||
println!(" {:<12} -> {} results (top dist: {:.4})", zone, r.len(),
|
||||
r.first().map(|x| x.distance).unwrap_or(0.0));
|
||||
}
|
||||
let fa = FilterExpr::Eq(FIELD_ANOMALY_TYPE, FilterValue::String("heat_wave".into()));
|
||||
let hr = store.query(&qv, 5, &QueryOptions { filter: Some(fa), ..Default::default() }).expect("q");
|
||||
println!(" heat_wave -> {} results", hr.len());
|
||||
|
||||
// Witness chain for environmental reporting audit
|
||||
println!("\n--- Witness Chain (Environmental Audit) ---");
|
||||
let steps = [
|
||||
("genesis", 0x01u8), ("observation", 0x01), ("qc_check", 0x02),
|
||||
("normalize", 0x02), ("embed", 0x02), ("spatial_graph", 0x02),
|
||||
("mincut", 0x02), ("classify", 0x02), ("per_event_eval", 0x02),
|
||||
("alert", 0x01), ("rvf_ingest", 0x08), ("similarity", 0x02),
|
||||
("lineage", 0x01), ("seal", 0x01),
|
||||
];
|
||||
let entries: Vec<WitnessEntry> = steps.iter().enumerate().map(|(i, (s, wt))| WitnessEntry {
|
||||
prev_hash: [0u8; 32],
|
||||
action_hash: shake256_256(format!("climate_gc:{}:{}", s, i).as_bytes()),
|
||||
timestamp_ns: 1_700_000_000_000_000_000 + i as u64 * 1_000_000_000,
|
||||
witness_type: *wt,
|
||||
}).collect();
|
||||
let chain = create_witness_chain(&entries);
|
||||
let ver = verify_witness_chain(&chain).expect("verify");
|
||||
println!(" {} entries, {} bytes, VALID", ver.len(), chain.len());
|
||||
for (i, (s, _)) in steps.iter().enumerate() {
|
||||
let wn = match ver[i].witness_type { 0x01 => "PROV", 0x02 => "COMP", 0x08 => "DATA", _ => "????" };
|
||||
println!(" [{:>4}] {:>2} -> {}", wn, i, s);
|
||||
}
|
||||
|
||||
// Lineage
|
||||
println!("\n--- Lineage ---");
|
||||
let child = store.derive(&tmp.path().join("climate_report.rvenv"),
|
||||
DerivationType::Filter, None).expect("derive");
|
||||
println!(" parent: {} -> child: {} (depth {})",
|
||||
hex(store.file_id()), hex(child.parent_id()), child.lineage_depth());
|
||||
child.close().expect("close");
|
||||
|
||||
// Summary
|
||||
println!("\n=== Summary ===");
|
||||
println!(" {} stations x {} seasons = {} embeddings", N_STATIONS, days.len(), ing.accepted);
|
||||
println!(" Anomaly types: 6 | Witness: {} steps | Climate zones: 4", ver.len());
|
||||
println!(" alpha={:.2} beta={:.2} gamma={:.2} k={}", alpha, beta, gamma, k_nn);
|
||||
store.close().expect("close");
|
||||
println!("\nDone.");
|
||||
}
|
||||
616
examples/rvf/examples/cyber_threat_graphcut.rs
Normal file
616
examples/rvf/examples/cyber_threat_graphcut.rs
Normal file
|
|
@ -0,0 +1,616 @@
|
|||
//! Cybersecurity Network Threat Detection via Graph Cut + RuVector
|
||||
//!
|
||||
//! MRF/mincut optimization for network intrusion detection:
|
||||
//! 1. Generate ~2000 synthetic network flows (24h period, ~4% attack rate)
|
||||
//! 2. Inject realistic attack patterns: PortScan, BruteForce, Exfiltration,
|
||||
//! C2Beacon, DDoS, LateralMovement (CICIDS2017/NSL-KDD inspired)
|
||||
//! 3. Extract 32-dim flow embeddings, build temporal+similarity graph, mincut
|
||||
//! 4. Evaluate: precision, recall, F1, FPR; per-attack detection rates
|
||||
//!
|
||||
//! Run: cargo run --example cyber_threat_graphcut --release
|
||||
|
||||
use rvf_runtime::{FilterExpr, MetadataEntry, MetadataValue, QueryOptions, RvfOptions, RvfStore};
|
||||
use rvf_runtime::filter::FilterValue;
|
||||
use rvf_runtime::options::DistanceMetric;
|
||||
use rvf_types::DerivationType;
|
||||
use rvf_crypto::{create_witness_chain, verify_witness_chain, shake256_256, WitnessEntry};
|
||||
use tempfile::TempDir;
|
||||
|
||||
const DIM: usize = 32;
|
||||
const N_FLOWS: usize = 2000;
|
||||
const FIELD_PROTOCOL: u16 = 0;
|
||||
const FIELD_PORT_CAT: u16 = 1;
|
||||
const FIELD_SUBNET: u16 = 2;
|
||||
const FIELD_ATTACK: u16 = 3;
|
||||
|
||||
fn lcg_next(s: &mut u64) -> u64 {
|
||||
*s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); *s
|
||||
}
|
||||
fn lcg_f64(s: &mut u64) -> f64 { lcg_next(s); (*s >> 11) as f64 / ((1u64 << 53) as f64) }
|
||||
fn lcg_normal(s: &mut u64) -> f64 {
|
||||
let u1 = lcg_f64(s).max(1e-15); let u2 = lcg_f64(s);
|
||||
(-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
enum Attack { Normal, PortScan, BruteForce, Exfiltration, C2Beacon, DDoS, LateralMovement }
|
||||
impl Attack {
|
||||
fn label(&self) -> &'static str {
|
||||
match self {
|
||||
Attack::Normal => "normal", Attack::PortScan => "portscan",
|
||||
Attack::BruteForce => "bruteforce", Attack::Exfiltration => "exfil",
|
||||
Attack::C2Beacon => "c2beacon", Attack::DDoS => "ddos",
|
||||
Attack::LateralMovement => "lateral",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
enum Protocol { Tcp, Udp, Icmp }
|
||||
impl Protocol {
|
||||
fn label(&self) -> &'static str {
|
||||
match self { Protocol::Tcp => "TCP", Protocol::Udp => "UDP", Protocol::Icmp => "ICMP" }
|
||||
}
|
||||
}
|
||||
|
||||
fn port_category(port: u16) -> &'static str {
|
||||
match port {
|
||||
80 | 443 | 8080 | 8443 => "web", 22 => "ssh", 53 => "dns",
|
||||
3389 => "rdp", 0 => "none", _ => "other",
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct Flow {
|
||||
index: usize,
|
||||
time_s: f64, // seconds into 24h window
|
||||
duration_ms: f64,
|
||||
bytes_sent: u64,
|
||||
bytes_recv: u64,
|
||||
pkts_sent: u32,
|
||||
pkts_recv: u32,
|
||||
protocol: Protocol,
|
||||
dst_port: u16,
|
||||
src_subnet: u8, // 10.x.0.0/16 subnet id (0-4 internal)
|
||||
dst_subnet: u8,
|
||||
is_internal_dst: bool,
|
||||
syn_flag: bool,
|
||||
ack_flag: bool,
|
||||
rst_flag: bool,
|
||||
psh_flag: bool,
|
||||
fin_flag: bool,
|
||||
payload_entropy: f64, // 0-8 bits
|
||||
truth: Attack,
|
||||
}
|
||||
|
||||
fn generate_flows(n: usize, seed: u64) -> Vec<Flow> {
|
||||
let mut rng = seed;
|
||||
let mut flows = Vec::with_capacity(n);
|
||||
let ports = [80u16, 443, 22, 53, 3389, 8080, 8443, 25, 110, 993, 3306, 5432];
|
||||
|
||||
for i in 0..n {
|
||||
let time_s = lcg_f64(&mut rng) * 86400.0;
|
||||
let hour = (time_s / 3600.0) as u32;
|
||||
// More traffic during business hours
|
||||
let biz_mult = if (8..18).contains(&hour) { 1.0 } else { 0.4 };
|
||||
let _ = biz_mult; // used implicitly via attack injection timing
|
||||
|
||||
let proto_r = lcg_f64(&mut rng);
|
||||
let protocol = if proto_r < 0.75 { Protocol::Tcp }
|
||||
else if proto_r < 0.92 { Protocol::Udp }
|
||||
else { Protocol::Icmp };
|
||||
|
||||
let port_idx = (lcg_next(&mut rng) % ports.len() as u64) as usize;
|
||||
let dst_port = if protocol == Protocol::Icmp { 0 } else { ports[port_idx] };
|
||||
let src_subnet = (lcg_next(&mut rng) % 5) as u8;
|
||||
let dst_r = lcg_f64(&mut rng);
|
||||
let is_internal = dst_r < 0.3;
|
||||
let dst_subnet = if is_internal { (lcg_next(&mut rng) % 5) as u8 } else { 100 + (lcg_next(&mut rng) % 50) as u8 };
|
||||
|
||||
let duration_ms = (50.0 + lcg_f64(&mut rng) * 2000.0 + lcg_normal(&mut rng).abs() * 500.0).max(1.0);
|
||||
let bytes_sent = (200.0 + lcg_f64(&mut rng) * 5000.0 + lcg_normal(&mut rng).abs() * 1000.0).max(40.0) as u64;
|
||||
let bytes_recv = (500.0 + lcg_f64(&mut rng) * 20000.0 + lcg_normal(&mut rng).abs() * 3000.0).max(40.0) as u64;
|
||||
let pkts_sent = (1.0 + bytes_sent as f64 / (400.0 + lcg_f64(&mut rng) * 800.0)).max(1.0) as u32;
|
||||
let pkts_recv = (1.0 + bytes_recv as f64 / (400.0 + lcg_f64(&mut rng) * 800.0)).max(1.0) as u32;
|
||||
let entropy = 4.0 + lcg_normal(&mut rng) * 1.0;
|
||||
|
||||
flows.push(Flow {
|
||||
index: i, time_s, duration_ms, bytes_sent, bytes_recv,
|
||||
pkts_sent, pkts_recv, protocol, dst_port, src_subnet, dst_subnet,
|
||||
is_internal_dst: is_internal,
|
||||
syn_flag: lcg_f64(&mut rng) < 0.6,
|
||||
ack_flag: lcg_f64(&mut rng) < 0.8,
|
||||
rst_flag: lcg_f64(&mut rng) < 0.05,
|
||||
psh_flag: lcg_f64(&mut rng) < 0.3,
|
||||
fin_flag: lcg_f64(&mut rng) < 0.4,
|
||||
payload_entropy: entropy.clamp(0.0, 8.0),
|
||||
truth: Attack::Normal,
|
||||
});
|
||||
}
|
||||
|
||||
// Inject attacks (~4% = ~80 flows)
|
||||
// PortScan: rapid small packets to many ports from one source
|
||||
let scan_src = 2u8;
|
||||
let scan_start = 14400.0; // 4am
|
||||
for j in 0..15 {
|
||||
let idx = 50 + j * 3;
|
||||
if idx >= n { break; }
|
||||
let f = &mut flows[idx];
|
||||
f.truth = Attack::PortScan;
|
||||
f.time_s = scan_start + j as f64 * 0.5;
|
||||
f.duration_ms = 2.0 + lcg_f64(&mut rng) * 10.0;
|
||||
f.bytes_sent = 40 + (lcg_next(&mut rng) % 60) as u64;
|
||||
f.bytes_recv = 0;
|
||||
f.pkts_sent = 1;
|
||||
f.pkts_recv = 0;
|
||||
f.protocol = Protocol::Tcp;
|
||||
f.dst_port = 1024 + (lcg_next(&mut rng) % 64000) as u16;
|
||||
f.src_subnet = scan_src;
|
||||
f.syn_flag = true; f.ack_flag = false; f.rst_flag = false;
|
||||
f.payload_entropy = 0.5 + lcg_f64(&mut rng) * 0.5;
|
||||
}
|
||||
|
||||
// BruteForce: repeated SSH/RDP from one source, many RSTs
|
||||
let bf_src = 3u8;
|
||||
for j in 0..12 {
|
||||
let idx = 200 + j * 2;
|
||||
if idx >= n { break; }
|
||||
let f = &mut flows[idx];
|
||||
f.truth = Attack::BruteForce;
|
||||
f.time_s = 28800.0 + j as f64 * 2.0; // 8am
|
||||
f.duration_ms = 100.0 + lcg_f64(&mut rng) * 200.0;
|
||||
f.bytes_sent = 80 + (lcg_next(&mut rng) % 200) as u64;
|
||||
f.bytes_recv = 60 + (lcg_next(&mut rng) % 100) as u64;
|
||||
f.pkts_sent = 3 + (lcg_next(&mut rng) % 5) as u32;
|
||||
f.pkts_recv = 2;
|
||||
f.protocol = Protocol::Tcp;
|
||||
f.dst_port = if lcg_f64(&mut rng) < 0.7 { 22 } else { 3389 };
|
||||
f.src_subnet = bf_src;
|
||||
f.dst_subnet = 120;
|
||||
f.is_internal_dst = false;
|
||||
f.syn_flag = true; f.rst_flag = lcg_f64(&mut rng) < 0.8;
|
||||
f.payload_entropy = 2.0 + lcg_f64(&mut rng) * 1.5;
|
||||
}
|
||||
|
||||
// Exfiltration: large outbound at night
|
||||
for j in 0..10 {
|
||||
let idx = 400 + j * 4;
|
||||
if idx >= n { break; }
|
||||
let f = &mut flows[idx];
|
||||
f.truth = Attack::Exfiltration;
|
||||
f.time_s = 7200.0 + j as f64 * 120.0; // 2-3am
|
||||
f.duration_ms = 5000.0 + lcg_f64(&mut rng) * 15000.0;
|
||||
f.bytes_sent = 500_000 + (lcg_next(&mut rng) % 2_000_000) as u64;
|
||||
f.bytes_recv = 200 + (lcg_next(&mut rng) % 500) as u64;
|
||||
f.pkts_sent = 300 + (lcg_next(&mut rng) % 500) as u32;
|
||||
f.pkts_recv = 5;
|
||||
f.protocol = Protocol::Tcp;
|
||||
f.dst_port = 443;
|
||||
f.src_subnet = 1;
|
||||
f.dst_subnet = 130;
|
||||
f.is_internal_dst = false;
|
||||
f.psh_flag = true;
|
||||
f.payload_entropy = 7.2 + lcg_f64(&mut rng) * 0.6;
|
||||
}
|
||||
|
||||
// C2Beacon: periodic small payloads to same external IP
|
||||
let c2_dst = 140u8;
|
||||
for j in 0..15 {
|
||||
let idx = 600 + j * 5;
|
||||
if idx >= n { break; }
|
||||
let f = &mut flows[idx];
|
||||
f.truth = Attack::C2Beacon;
|
||||
f.time_s = 3600.0 * j as f64 / 15.0 * 24.0; // spread across day
|
||||
f.duration_ms = 50.0 + lcg_f64(&mut rng) * 100.0;
|
||||
f.bytes_sent = 80 + (lcg_next(&mut rng) % 150) as u64;
|
||||
f.bytes_recv = 60 + (lcg_next(&mut rng) % 120) as u64;
|
||||
f.pkts_sent = 1;
|
||||
f.pkts_recv = 1;
|
||||
f.protocol = Protocol::Tcp;
|
||||
f.dst_port = 443;
|
||||
f.src_subnet = 0;
|
||||
f.dst_subnet = c2_dst;
|
||||
f.is_internal_dst = false;
|
||||
f.payload_entropy = 6.5 + lcg_f64(&mut rng) * 1.0;
|
||||
}
|
||||
|
||||
// DDoS: high packet rate from many sources to single target
|
||||
for j in 0..18 {
|
||||
let idx = 900 + j * 2;
|
||||
if idx >= n { break; }
|
||||
let f = &mut flows[idx];
|
||||
f.truth = Attack::DDoS;
|
||||
f.time_s = 43200.0 + j as f64 * 0.3; // noon burst
|
||||
f.duration_ms = 1.0 + lcg_f64(&mut rng) * 5.0;
|
||||
f.bytes_sent = 40 + (lcg_next(&mut rng) % 80) as u64;
|
||||
f.bytes_recv = 0;
|
||||
f.pkts_sent = 50 + (lcg_next(&mut rng) % 200) as u32;
|
||||
f.pkts_recv = 0;
|
||||
f.protocol = if lcg_f64(&mut rng) < 0.6 { Protocol::Udp } else { Protocol::Tcp };
|
||||
f.dst_port = 80;
|
||||
f.src_subnet = (lcg_next(&mut rng) % 5) as u8;
|
||||
f.dst_subnet = 110;
|
||||
f.is_internal_dst = false;
|
||||
f.syn_flag = true; f.ack_flag = false;
|
||||
f.payload_entropy = 1.0 + lcg_f64(&mut rng) * 1.0;
|
||||
}
|
||||
|
||||
// LateralMovement: internal-to-internal, unusual ports
|
||||
for j in 0..10 {
|
||||
let idx = 1200 + j * 3;
|
||||
if idx >= n { break; }
|
||||
let f = &mut flows[idx];
|
||||
f.truth = Attack::LateralMovement;
|
||||
f.time_s = 50400.0 + j as f64 * 60.0; // 2pm
|
||||
f.duration_ms = 200.0 + lcg_f64(&mut rng) * 1000.0;
|
||||
f.bytes_sent = 1000 + (lcg_next(&mut rng) % 10000) as u64;
|
||||
f.bytes_recv = 500 + (lcg_next(&mut rng) % 5000) as u64;
|
||||
f.pkts_sent = 10 + (lcg_next(&mut rng) % 30) as u32;
|
||||
f.pkts_recv = 8 + (lcg_next(&mut rng) % 20) as u32;
|
||||
f.protocol = Protocol::Tcp;
|
||||
f.dst_port = 445 + (lcg_next(&mut rng) % 100) as u16; // SMB-ish
|
||||
f.src_subnet = j as u8 % 5;
|
||||
f.dst_subnet = (j as u8 + 1) % 5;
|
||||
f.is_internal_dst = true;
|
||||
f.psh_flag = true;
|
||||
f.payload_entropy = 5.5 + lcg_f64(&mut rng) * 1.5;
|
||||
}
|
||||
|
||||
flows.sort_by(|a, b| a.time_s.partial_cmp(&b.time_s).unwrap());
|
||||
for (i, f) in flows.iter_mut().enumerate() { f.index = i; }
|
||||
flows
|
||||
}
|
||||
|
||||
fn extract_embedding(f: &Flow) -> Vec<f32> {
|
||||
let log_dur = (f.duration_ms.max(1.0)).ln() as f32 / 10.0;
|
||||
let log_bs = (f.bytes_sent.max(1) as f64).ln() as f32 / 15.0;
|
||||
let log_br = (f.bytes_recv.max(1) as f64).ln() as f32 / 15.0;
|
||||
let pkt_ratio = if f.pkts_sent + f.pkts_recv > 0 {
|
||||
f.pkts_sent as f32 / (f.pkts_sent + f.pkts_recv) as f32
|
||||
} else { 0.5 };
|
||||
// Protocol one-hot
|
||||
let (p_tcp, p_udp, p_icmp) = match f.protocol {
|
||||
Protocol::Tcp => (1.0f32, 0.0, 0.0),
|
||||
Protocol::Udp => (0.0, 1.0, 0.0),
|
||||
Protocol::Icmp => (0.0, 0.0, 1.0),
|
||||
};
|
||||
// Port category one-hot
|
||||
let pc = port_category(f.dst_port);
|
||||
let (pw, ps, pd, pr, po) = match pc {
|
||||
"web"=>(1.0f32,0.0,0.0,0.0,0.0), "ssh"=>(0.0,1.0,0.0,0.0,0.0),
|
||||
"dns"=>(0.0,0.0,1.0,0.0,0.0), "rdp"=>(0.0,0.0,0.0,1.0,0.0),
|
||||
_=>(0.0,0.0,0.0,0.0,1.0),
|
||||
};
|
||||
let internal = if f.is_internal_dst { 1.0f32 } else { 0.0 };
|
||||
let subnet_enc = f.src_subnet as f32 / 5.0;
|
||||
// Time cyclic
|
||||
let hour_rad = f.time_s as f32 / 86400.0 * 2.0 * std::f32::consts::PI;
|
||||
let t_sin = hour_rad.sin();
|
||||
let t_cos = hour_rad.cos();
|
||||
let dow = 0.5f32; // mid-week placeholder
|
||||
let entropy = f.payload_entropy as f32 / 8.0;
|
||||
let avg_pkt = if f.pkts_sent + f.pkts_recv > 0 {
|
||||
(f.bytes_sent + f.bytes_recv) as f32 / (f.pkts_sent + f.pkts_recv) as f32 / 1500.0
|
||||
} else { 0.0 };
|
||||
let bpp = if f.bytes_sent > 0 && f.pkts_sent > 0 {
|
||||
f.bytes_sent as f32 / f.pkts_sent as f32 / 1500.0
|
||||
} else { 0.0 };
|
||||
// Flag pattern features
|
||||
let syn_flood = if f.syn_flag && !f.ack_flag { 1.0f32 } else { 0.0 };
|
||||
let rst_ind = if f.rst_flag { 1.0f32 } else { 0.0 };
|
||||
// Derived
|
||||
let vol = log_bs + log_br;
|
||||
let asym = (log_bs - log_br).abs();
|
||||
|
||||
vec![
|
||||
log_dur, log_bs, log_br, pkt_ratio,
|
||||
p_tcp, p_udp, p_icmp,
|
||||
pw, ps, pd, pr, po,
|
||||
internal, subnet_enc,
|
||||
t_sin, t_cos, dow,
|
||||
entropy, avg_pkt, bpp,
|
||||
syn_flood, rst_ind,
|
||||
vol, asym,
|
||||
f.pkts_sent as f32 / 100.0, f.pkts_recv as f32 / 100.0,
|
||||
(f.duration_ms as f32 / 1000.0).min(5.0),
|
||||
(f.bytes_sent as f64 / f.duration_ms.max(1.0)) as f32 / 1000.0,
|
||||
entropy * syn_flood, entropy * asym,
|
||||
log_dur * rst_ind, vol * internal,
|
||||
]
|
||||
}
|
||||
|
||||
fn cosine_sim(a: &[f32], b: &[f32]) -> f64 {
|
||||
let (mut d, mut na, mut nb) = (0.0f64, 0.0f64, 0.0f64);
|
||||
for i in 0..a.len().min(b.len()) {
|
||||
d += a[i] as f64 * b[i] as f64;
|
||||
na += (a[i] as f64).powi(2); nb += (b[i] as f64).powi(2);
|
||||
}
|
||||
let dn = na.sqrt() * nb.sqrt();
|
||||
if dn < 1e-15 { 0.0 } else { d / dn }
|
||||
}
|
||||
|
||||
fn anomaly_score(f: &Flow) -> f64 {
|
||||
let vol = ((f.bytes_sent + f.bytes_recv).max(1) as f64).ln();
|
||||
let ent = f.payload_entropy;
|
||||
let dur = (f.duration_ms.max(1.0)).ln();
|
||||
let pps = if f.duration_ms > 0.0 { f.pkts_sent as f64 / (f.duration_ms / 1000.0) } else { 0.0 };
|
||||
let syn_no_ack = if f.syn_flag && !f.ack_flag { 1.0 } else { 0.0 };
|
||||
let rst_p = if f.rst_flag { 0.5 } else { 0.0 };
|
||||
let night = if f.time_s < 21600.0 || f.time_s > 79200.0 { 0.3 } else { 0.0 };
|
||||
let high_vol = if vol > 12.0 { (vol - 12.0) * 0.3 } else { 0.0 };
|
||||
let low_pkt = if f.bytes_sent > 0 && f.pkts_sent <= 1 && f.bytes_recv == 0 { 0.4 } else { 0.0 };
|
||||
let high_pps = if pps > 50.0 { (pps - 50.0) * 0.01 } else { 0.0 };
|
||||
let high_ent = if ent > 6.5 { (ent - 6.5) * 0.5 } else { 0.0 };
|
||||
let short_burst = if dur < 2.0 && f.pkts_sent > 10 { 0.5 } else { 0.0 };
|
||||
|
||||
0.2 * syn_no_ack + rst_p + night + high_vol + low_pkt + high_pps + high_ent + short_burst - 0.4
|
||||
}
|
||||
|
||||
struct Edge { from: usize, to: usize, weight: f64 }
|
||||
|
||||
fn build_graph(flows: &[Flow], embs: &[Vec<f32>], alpha: f64, beta: f64, k: usize) -> Vec<Edge> {
|
||||
let m = flows.len();
|
||||
let mut edges = Vec::new();
|
||||
|
||||
// Temporal chain: consecutive flows from same source subnet
|
||||
for i in 1..m {
|
||||
if flows[i].src_subnet == flows[i - 1].src_subnet {
|
||||
let dt = (flows[i].time_s - flows[i - 1].time_s).abs();
|
||||
let tw = alpha * (-dt / 300.0).exp(); // decay over 5min
|
||||
if tw > 1e-6 {
|
||||
edges.push(Edge { from: i - 1, to: i, weight: tw });
|
||||
edges.push(Edge { from: i, to: i - 1, weight: tw });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Same-destination smoothing
|
||||
for i in 0..m {
|
||||
for j in (i + 1)..m.min(i + 50) {
|
||||
if flows[i].dst_subnet == flows[j].dst_subnet && flows[i].dst_port == flows[j].dst_port {
|
||||
edges.push(Edge { from: i, to: j, weight: alpha * 0.5 });
|
||||
edges.push(Edge { from: j, to: i, weight: alpha * 0.5 });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// kNN similarity from embeddings
|
||||
for i in 0..m {
|
||||
let mut sims: Vec<(usize, f64)> = (0..m)
|
||||
.filter(|&j| j != i && (j as isize - i as isize).unsigned_abs() > 3)
|
||||
.map(|j| (j, cosine_sim(&embs[i], &embs[j]).max(0.0))).collect();
|
||||
sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
||||
for &(j, s) in sims.iter().take(k) {
|
||||
if s > 0.1 { edges.push(Edge { from: i, to: j, weight: beta * s }); }
|
||||
}
|
||||
}
|
||||
|
||||
edges
|
||||
}
|
||||
|
||||
fn solve_mincut(lambdas: &[f64], edges: &[Edge], gamma: f64) -> Vec<bool> {
|
||||
let m = lambdas.len();
|
||||
let (s, t, n) = (m, m + 1, m + 2);
|
||||
let mut adj: Vec<Vec<(usize, usize)>> = vec![Vec::new(); n];
|
||||
let mut caps: Vec<f64> = Vec::new();
|
||||
let ae = |adj: &mut Vec<Vec<(usize, usize)>>, caps: &mut Vec<f64>, u: usize, v: usize, c: f64| {
|
||||
let idx = caps.len(); caps.push(c); caps.push(0.0);
|
||||
adj[u].push((v, idx)); adj[v].push((u, idx + 1));
|
||||
};
|
||||
for i in 0..m {
|
||||
let p0 = lambdas[i].max(0.0);
|
||||
let p1 = (-lambdas[i]).max(0.0);
|
||||
if p0 > 1e-12 { ae(&mut adj, &mut caps, s, i, p0); }
|
||||
if p1 > 1e-12 { ae(&mut adj, &mut caps, i, t, p1); }
|
||||
}
|
||||
for e in edges {
|
||||
let c = gamma * e.weight;
|
||||
if c > 1e-12 { ae(&mut adj, &mut caps, e.from, e.to, c); }
|
||||
}
|
||||
loop {
|
||||
let mut par: Vec<Option<(usize, usize)>> = vec![None; n];
|
||||
let mut vis = vec![false; n];
|
||||
let mut q = std::collections::VecDeque::new();
|
||||
vis[s] = true; q.push_back(s);
|
||||
while let Some(u) = q.pop_front() {
|
||||
if u == t { break; }
|
||||
for &(v, ei) in &adj[u] {
|
||||
if !vis[v] && caps[ei] > 1e-15 { vis[v] = true; par[v] = Some((u, ei)); q.push_back(v); }
|
||||
}
|
||||
}
|
||||
if !vis[t] { break; }
|
||||
let mut bn = f64::MAX;
|
||||
let mut v = t;
|
||||
while let Some((u, ei)) = par[v] { bn = bn.min(caps[ei]); v = u; }
|
||||
v = t;
|
||||
while let Some((u, ei)) = par[v] { caps[ei] -= bn; caps[ei ^ 1] += bn; v = u; }
|
||||
}
|
||||
let mut reach = vec![false; n];
|
||||
let mut stk = vec![s]; reach[s] = true;
|
||||
while let Some(u) = stk.pop() {
|
||||
for &(v, ei) in &adj[u] {
|
||||
if !reach[v] && caps[ei] > 1e-15 { reach[v] = true; stk.push(v); }
|
||||
}
|
||||
}
|
||||
(0..m).map(|i| reach[i]).collect()
|
||||
}
|
||||
|
||||
fn threshold_detect(flows: &[Flow]) -> Vec<bool> {
|
||||
// Simple baseline: volume > 3 sigma or entropy > 6.0
|
||||
let vols: Vec<f64> = flows.iter().map(|f| (f.bytes_sent + f.bytes_recv) as f64).collect();
|
||||
let mean = vols.iter().sum::<f64>() / vols.len() as f64;
|
||||
let std = (vols.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / vols.len() as f64).sqrt();
|
||||
flows.iter().enumerate().map(|(i, f)| {
|
||||
vols[i] > mean + 3.0 * std || f.payload_entropy > 6.0
|
||||
}).collect()
|
||||
}
|
||||
|
||||
struct Metrics { precision: f64, recall: f64, f1: f64, fpr: f64 }
|
||||
|
||||
fn evaluate(flows: &[Flow], preds: &[bool]) -> Metrics {
|
||||
let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0, 0, 0);
|
||||
for (f, &p) in flows.iter().zip(preds) {
|
||||
let truth = f.truth != Attack::Normal;
|
||||
match (p, truth) {
|
||||
(true, true) => tp += 1, (true, false) => fp += 1,
|
||||
(false, false) => tn += 1, (false, true) => fn_ += 1,
|
||||
}
|
||||
}
|
||||
let prec = if tp + fp > 0 { tp as f64 / (tp + fp) as f64 } else { 0.0 };
|
||||
let rec = if tp + fn_ > 0 { tp as f64 / (tp + fn_) as f64 } else { 0.0 };
|
||||
let f1 = if prec + rec > 0.0 { 2.0 * prec * rec / (prec + rec) } else { 0.0 };
|
||||
let fpr = if tn + fp > 0 { fp as f64 / (tn + fp) as f64 } else { 0.0 };
|
||||
Metrics { precision: prec, recall: rec, f1, fpr }
|
||||
}
|
||||
|
||||
fn per_attack_detection(flows: &[Flow], preds: &[bool]) -> Vec<(&'static str, usize, usize)> {
|
||||
let attacks = [
|
||||
Attack::PortScan, Attack::BruteForce, Attack::Exfiltration,
|
||||
Attack::C2Beacon, Attack::DDoS, Attack::LateralMovement,
|
||||
];
|
||||
attacks.iter().map(|at| {
|
||||
let total = flows.iter().filter(|f| f.truth == *at).count();
|
||||
let det = flows.iter().zip(preds).filter(|(f, &p)| f.truth == *at && p).count();
|
||||
(at.label(), det, total)
|
||||
}).collect()
|
||||
}
|
||||
|
||||
fn hex(b: &[u8]) -> String { b.iter().map(|x| format!("{:02x}", x)).collect() }
|
||||
|
||||
fn main() {
|
||||
println!("=== Cyber Threat Detection via Graph Cut + RuVector ===\n");
|
||||
|
||||
let (alpha, beta, gamma, k_nn) = (0.25, 0.12, 0.5, 3usize);
|
||||
let tmp = TempDir::new().expect("tmpdir");
|
||||
let opts = RvfOptions { dimension: DIM as u16, metric: DistanceMetric::Cosine, ..Default::default() };
|
||||
let mut store = RvfStore::create(&tmp.path().join("cyber_flows.rvnet"), opts).expect("create");
|
||||
|
||||
println!(" Flows: {} | Dim: {} | alpha={} beta={} gamma={} k={}\n",
|
||||
N_FLOWS, DIM, alpha, beta, gamma, k_nn);
|
||||
|
||||
let flows = generate_flows(N_FLOWS, 42);
|
||||
let n_attack = flows.iter().filter(|f| f.truth != Attack::Normal).count();
|
||||
println!(" Generated: {} flows, {} attacks ({:.1}%)",
|
||||
N_FLOWS, n_attack, n_attack as f64 / N_FLOWS as f64 * 100.0);
|
||||
|
||||
println!("\n Attack Distribution:");
|
||||
for at in &[Attack::PortScan, Attack::BruteForce, Attack::Exfiltration,
|
||||
Attack::C2Beacon, Attack::DDoS, Attack::LateralMovement] {
|
||||
let c = flows.iter().filter(|f| f.truth == *at).count();
|
||||
if c > 0 { println!(" {:>12}: {:>3}", at.label(), c); }
|
||||
}
|
||||
|
||||
// Extract embeddings
|
||||
let embs: Vec<Vec<f32>> = flows.iter().map(|f| extract_embedding(f)).collect();
|
||||
|
||||
// Build graph and solve
|
||||
let lam: Vec<f64> = flows.iter().map(|f| anomaly_score(f)).collect();
|
||||
let edges = build_graph(&flows, &embs, alpha, beta, k_nn);
|
||||
println!("\n Graph: {} edges ({:.1} per flow)", edges.len(), edges.len() as f64 / N_FLOWS as f64);
|
||||
|
||||
let gc_preds = solve_mincut(&lam, &edges, gamma);
|
||||
let th_preds = threshold_detect(&flows);
|
||||
|
||||
let gc_m = evaluate(&flows, &gc_preds);
|
||||
let th_m = evaluate(&flows, &th_preds);
|
||||
|
||||
println!("\n {:>12} {:>8} {:>8} {:>8} {:>8}", "Method", "Prec", "Recall", "F1", "FPR");
|
||||
println!(" {:->12} {:->8} {:->8} {:->8} {:->8}", "", "", "", "", "");
|
||||
println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}", "Graph Cut", gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr);
|
||||
println!(" {:>12} {:>8.3} {:>8.3} {:>8.3} {:>8.3}", "Threshold", th_m.precision, th_m.recall, th_m.f1, th_m.fpr);
|
||||
|
||||
println!("\n Per-Attack Detection (Graph Cut vs Threshold):");
|
||||
println!(" {:>12} {:>8} {:>10}", "Attack", "GC", "Threshold");
|
||||
println!(" {:->12} {:->8} {:->10}", "", "", "");
|
||||
let gc_pa = per_attack_detection(&flows, &gc_preds);
|
||||
let th_pa = per_attack_detection(&flows, &th_preds);
|
||||
for (gc, th) in gc_pa.iter().zip(th_pa.iter()) {
|
||||
println!(" {:>12} {:>3}/{:<3} {:>5}/{:<3}",
|
||||
gc.0, gc.1, gc.2, th.1, th.2);
|
||||
}
|
||||
|
||||
// RVF ingestion
|
||||
println!("\n--- RVF Ingestion ---");
|
||||
let mut vecs = Vec::new();
|
||||
let mut ids = Vec::new();
|
||||
let mut meta = Vec::new();
|
||||
for (i, f) in flows.iter().enumerate() {
|
||||
vecs.push(embs[i].clone());
|
||||
ids.push(i as u64);
|
||||
meta.push(MetadataEntry { field_id: FIELD_PROTOCOL,
|
||||
value: MetadataValue::String(f.protocol.label().into()) });
|
||||
meta.push(MetadataEntry { field_id: FIELD_PORT_CAT,
|
||||
value: MetadataValue::String(port_category(f.dst_port).into()) });
|
||||
meta.push(MetadataEntry { field_id: FIELD_SUBNET,
|
||||
value: MetadataValue::U64(f.src_subnet as u64) });
|
||||
meta.push(MetadataEntry { field_id: FIELD_ATTACK,
|
||||
value: MetadataValue::String(f.truth.label().into()) });
|
||||
}
|
||||
let refs: Vec<&[f32]> = vecs.iter().map(|v| v.as_slice()).collect();
|
||||
let ing = store.ingest_batch(&refs, &ids, Some(&meta)).expect("ingest");
|
||||
println!(" Ingested: {} (rejected: {})", ing.accepted, ing.rejected);
|
||||
|
||||
// Filtered queries
|
||||
println!("\n--- RVF Queries ---");
|
||||
// Find flows similar to a known portscan
|
||||
let scan_idx = flows.iter().position(|f| f.truth == Attack::PortScan).unwrap_or(0);
|
||||
let qv = &embs[scan_idx];
|
||||
let res = store.query(qv, 10, &QueryOptions::default()).expect("q");
|
||||
println!(" PortScan-similar (flow {}): {} results", scan_idx, res.len());
|
||||
for r in res.iter().take(5) {
|
||||
let at = flows[r.id as usize].truth.label();
|
||||
println!(" id={:>5} dist={:.4} attack={}", r.id, r.distance, at);
|
||||
}
|
||||
|
||||
// Filter by protocol
|
||||
let ft = FilterExpr::Eq(FIELD_PROTOCOL, FilterValue::String("TCP".into()));
|
||||
let tr = store.query(qv, 5, &QueryOptions { filter: Some(ft), ..Default::default() }).expect("q");
|
||||
println!(" TCP-only: {} results", tr.len());
|
||||
|
||||
let fu = FilterExpr::Eq(FIELD_PROTOCOL, FilterValue::String("UDP".into()));
|
||||
let ur = store.query(qv, 5, &QueryOptions { filter: Some(fu), ..Default::default() }).expect("q");
|
||||
println!(" UDP-only: {} results", ur.len());
|
||||
|
||||
// Witness chain: incident response audit trail
|
||||
println!("\n--- Witness Chain (Incident Response) ---");
|
||||
let steps = [
|
||||
("capture", 0x08u8), ("normalize", 0x02), ("embed", 0x02),
|
||||
("graph_build", 0x02), ("mincut_solve", 0x02), ("classify", 0x02),
|
||||
("alert", 0x01), ("forensics", 0x02), ("rvf_ingest", 0x08),
|
||||
("query", 0x02), ("report", 0x01), ("seal", 0x01),
|
||||
];
|
||||
let entries: Vec<WitnessEntry> = steps.iter().enumerate().map(|(i, (s, wt))| WitnessEntry {
|
||||
prev_hash: [0u8; 32],
|
||||
action_hash: shake256_256(format!("cyber:{}:{}", s, i).as_bytes()),
|
||||
timestamp_ns: 1_700_000_000_000_000_000 + i as u64 * 1_000_000_000,
|
||||
witness_type: *wt,
|
||||
}).collect();
|
||||
let chain = create_witness_chain(&entries);
|
||||
let verified = verify_witness_chain(&chain).expect("verify");
|
||||
println!(" {} entries, {} bytes, VALID", verified.len(), chain.len());
|
||||
for (i, (s, _)) in steps.iter().enumerate() {
|
||||
let wn = match verified[i].witness_type { 0x01 => "PROV", 0x02 => "COMP", 0x08 => "DATA", _ => "????" };
|
||||
println!(" [{:>4}] {:>2} -> {}", wn, i, s);
|
||||
}
|
||||
|
||||
// Lineage
|
||||
println!("\n--- Lineage ---");
|
||||
let child = store.derive(&tmp.path().join("alerts.rvnet"), DerivationType::Filter, None).expect("derive");
|
||||
let lineage_depth = child.lineage_depth();
|
||||
println!(" parent: {} -> child: {} (depth {})",
|
||||
hex(store.file_id()), hex(child.parent_id()), lineage_depth);
|
||||
child.close().expect("close");
|
||||
|
||||
// Summary
|
||||
println!("\n=== Summary ===");
|
||||
println!(" {} flows | {} attacks ({:.1}%) | {} embeddings ingested",
|
||||
N_FLOWS, n_attack, n_attack as f64 / N_FLOWS as f64 * 100.0, ing.accepted);
|
||||
println!(" Graph Cut F1={:.3} Prec={:.3} Rec={:.3} FPR={:.3}",
|
||||
gc_m.f1, gc_m.precision, gc_m.recall, gc_m.fpr);
|
||||
println!(" Threshold F1={:.3} Prec={:.3} Rec={:.3} FPR={:.3}",
|
||||
th_m.f1, th_m.precision, th_m.recall, th_m.fpr);
|
||||
println!(" Witness: {} steps | Lineage depth: {}", verified.len(), lineage_depth);
|
||||
|
||||
store.close().expect("close");
|
||||
println!("\nDone.");
|
||||
}
|
||||
518
examples/rvf/examples/financial_fraud_graphcut.rs
Normal file
518
examples/rvf/examples/financial_fraud_graphcut.rs
Normal file
|
|
@ -0,0 +1,518 @@
|
|||
//! Financial Fraud Detection via Graph Cut / MRF Optimization + RuVector
|
||||
//!
|
||||
//! Applies the same MRF/mincut formulation used in genomic and medical pipelines
|
||||
//! to financial transaction fraud detection:
|
||||
//!
|
||||
//! 1. Generate ~2000 synthetic transactions with realistic distributions
|
||||
//! 2. Inject fraud patterns: card-not-present, account takeover, card clone,
|
||||
//! synthetic identity, refund abuse (~0.5% fraud rate)
|
||||
//! 3. Extract 32-dim embeddings, build transaction graph, solve s-t mincut
|
||||
//! 4. Compare graph cut vs simple threshold baseline
|
||||
//! 5. Store embeddings in RVF with merchant/amount metadata, witness chain
|
||||
//!
|
||||
//! Run: cargo run --example financial_fraud_graphcut --release
|
||||
|
||||
use rvf_runtime::{FilterExpr, MetadataEntry, MetadataValue, QueryOptions, RvfOptions, RvfStore};
|
||||
use rvf_runtime::filter::FilterValue;
|
||||
use rvf_runtime::options::DistanceMetric;
|
||||
use rvf_types::DerivationType;
|
||||
use rvf_crypto::{create_witness_chain, verify_witness_chain, shake256_256, WitnessEntry};
|
||||
use tempfile::TempDir;
|
||||
|
||||
const DIM: usize = 32;
|
||||
const N_TX: usize = 2000;
|
||||
const FIELD_MERCHANT: u16 = 0;
|
||||
const FIELD_AMOUNT_BUCKET: u16 = 1;
|
||||
const FIELD_FRAUD_TYPE: u16 = 2;
|
||||
|
||||
fn lcg_next(s: &mut u64) -> u64 {
|
||||
*s = s.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); *s
|
||||
}
|
||||
fn lcg_f64(s: &mut u64) -> f64 { lcg_next(s); (*s >> 11) as f64 / ((1u64 << 53) as f64) }
|
||||
fn lcg_normal(s: &mut u64) -> f64 {
|
||||
let u1 = lcg_f64(s).max(1e-15); let u2 = lcg_f64(s);
|
||||
(-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
enum FraudType { Legit, CardNotPresent, AccountTakeover, CardClone, Synthetic, Refund }
|
||||
impl FraudType {
|
||||
fn label(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Legit => "legit", Self::CardNotPresent => "CNP",
|
||||
Self::AccountTakeover => "ATO", Self::CardClone => "clone",
|
||||
Self::Synthetic => "synth", Self::Refund => "refund",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const MERCHANTS: [&str; 10] = [
|
||||
"grocery", "gas", "restaurant", "online_retail", "travel",
|
||||
"electronics", "pharmacy", "subscription", "atm", "luxury",
|
||||
];
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct Transaction {
|
||||
id: usize,
|
||||
card_id: usize,
|
||||
amount: f64,
|
||||
hour: f64,
|
||||
merchant_cat: usize,
|
||||
card_present: bool,
|
||||
geo_distance_km: f64,
|
||||
velocity: f64,
|
||||
v_features: [f64; 10],
|
||||
fraud: FraudType,
|
||||
}
|
||||
|
||||
fn generate_transactions(n: usize, seed: u64) -> Vec<Transaction> {
|
||||
let mut rng = seed;
|
||||
let n_cards = 200;
|
||||
let mut txs = Vec::with_capacity(n);
|
||||
let mut card_last_hour = vec![0.0f64; n_cards];
|
||||
let mut card_tx_count = vec![0usize; n_cards];
|
||||
let mut card_last_geo = vec![0.0f64; n_cards];
|
||||
|
||||
for i in 0..n {
|
||||
let card_id = (lcg_next(&mut rng) as usize) % n_cards;
|
||||
// Log-normal amount: median ~$50, mean ~$88
|
||||
let log_amt = 3.9 + lcg_normal(&mut rng) * 0.8;
|
||||
let amount = log_amt.exp().min(5000.0);
|
||||
// Time of day: bimodal (lunch 12h, evening 19h)
|
||||
let hour = if lcg_f64(&mut rng) < 0.6 {
|
||||
12.0 + lcg_normal(&mut rng) * 3.0
|
||||
} else {
|
||||
19.0 + lcg_normal(&mut rng) * 2.5
|
||||
}.rem_euclid(24.0);
|
||||
let merchant_cat = (lcg_next(&mut rng) as usize) % 10;
|
||||
let card_present = lcg_f64(&mut rng) < 0.7;
|
||||
let home_geo = (card_id as f64 * 7.3) % 180.0;
|
||||
let geo_distance_km = (lcg_f64(&mut rng) * 15.0).max(0.1);
|
||||
let dt = hour - card_last_hour[card_id];
|
||||
let velocity = if dt.abs() > 0.1 {
|
||||
card_tx_count[card_id] as f64 / dt.abs().max(0.5)
|
||||
} else { card_tx_count[card_id] as f64 + 1.0 };
|
||||
|
||||
// PCA-like features V1-V10 from real distribution shapes
|
||||
let mut vf = [0.0f64; 10];
|
||||
for k in 0..10 {
|
||||
vf[k] = lcg_normal(&mut rng) * (1.0 / (k as f64 + 1.0).sqrt());
|
||||
}
|
||||
|
||||
card_last_hour[card_id] = hour;
|
||||
card_last_geo[card_id] = home_geo;
|
||||
card_tx_count[card_id] += 1;
|
||||
|
||||
txs.push(Transaction {
|
||||
id: i, card_id, amount, hour, merchant_cat, card_present,
|
||||
geo_distance_km, velocity, v_features: vf, fraud: FraudType::Legit,
|
||||
});
|
||||
}
|
||||
|
||||
// Inject fraud (~0.5% = ~10 transactions across 5 types)
|
||||
let fraud_types = [
|
||||
FraudType::CardNotPresent, FraudType::AccountTakeover,
|
||||
FraudType::CardClone, FraudType::Synthetic, FraudType::Refund,
|
||||
];
|
||||
for ft in &fraud_types {
|
||||
let count = 2;
|
||||
for _ in 0..count {
|
||||
let idx = (lcg_next(&mut rng) as usize) % n;
|
||||
let tx = &mut txs[idx];
|
||||
tx.fraud = *ft;
|
||||
match ft {
|
||||
FraudType::CardNotPresent => {
|
||||
tx.card_present = false;
|
||||
tx.amount = (tx.amount * 3.5 + 200.0).min(4500.0);
|
||||
tx.hour = 3.0 + lcg_f64(&mut rng) * 3.0; // 3-6 AM
|
||||
tx.merchant_cat = 3; // online_retail
|
||||
tx.v_features[0] += 2.5; tx.v_features[1] -= 1.8;
|
||||
}
|
||||
FraudType::AccountTakeover => {
|
||||
tx.velocity = 8.0 + lcg_f64(&mut rng) * 5.0; // burst
|
||||
tx.amount = 500.0 + lcg_f64(&mut rng) * 2000.0;
|
||||
tx.v_features[2] += 3.0; tx.v_features[3] -= 2.0;
|
||||
}
|
||||
FraudType::CardClone => {
|
||||
tx.geo_distance_km = 800.0 + lcg_f64(&mut rng) * 5000.0; // impossible travel
|
||||
tx.card_present = true;
|
||||
tx.v_features[4] += 2.8; tx.v_features[5] -= 1.5;
|
||||
}
|
||||
FraudType::Synthetic => {
|
||||
tx.amount = 50.0 + lcg_f64(&mut rng) * 150.0; // gradual escalation
|
||||
tx.merchant_cat = 9; // luxury
|
||||
tx.v_features[6] += 2.0; tx.v_features[7] += 1.5;
|
||||
}
|
||||
FraudType::Refund => {
|
||||
tx.amount = -(20.0 + lcg_f64(&mut rng) * 80.0); // refund (negative)
|
||||
tx.v_features[8] -= 2.5; tx.v_features[9] += 1.8;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
txs
|
||||
}
|
||||
|
||||
fn extract_embedding(tx: &Transaction, mean_amt: f64, std_amt: f64) -> Vec<f32> {
|
||||
let log_amt = (tx.amount.abs() + 1.0).ln();
|
||||
let hour_sin = (tx.hour * std::f64::consts::PI / 12.0).sin();
|
||||
let hour_cos = (tx.hour * std::f64::consts::PI / 12.0).cos();
|
||||
let mut cat_oh = [0.0f32; 10];
|
||||
cat_oh[tx.merchant_cat] = 1.0;
|
||||
let cp_flag = if tx.card_present { 1.0f32 } else { 0.0 };
|
||||
let geo_log = (tx.geo_distance_km + 1.0).ln();
|
||||
let vel_log = (tx.velocity + 1.0).ln();
|
||||
let amt_zscore = if std_amt > 1e-6 { (tx.amount - mean_amt) / std_amt } else { 0.0 };
|
||||
let amt_vel = log_amt * vel_log;
|
||||
let geo_time = geo_log * (tx.hour / 24.0);
|
||||
|
||||
let mut emb = Vec::with_capacity(DIM);
|
||||
emb.push(log_amt as f32); // 0
|
||||
emb.push(hour_sin as f32); // 1
|
||||
emb.push(hour_cos as f32); // 2
|
||||
for v in &cat_oh { emb.push(*v); } // 3-12
|
||||
emb.push(cp_flag); // 13
|
||||
emb.push(geo_log as f32); // 14
|
||||
emb.push(vel_log as f32); // 15
|
||||
emb.push(amt_zscore as f32); // 16
|
||||
for k in 0..10 { emb.push(tx.v_features[k] as f32); } // 17-26
|
||||
emb.push(amt_vel as f32); // 27
|
||||
emb.push(geo_time as f32); // 28
|
||||
emb.push(if tx.amount < 0.0 { 1.0 } else { 0.0 }); // 29 refund flag
|
||||
emb.push((tx.id as f64 / N_TX as f64) as f32); // 30 temporal position
|
||||
while emb.len() < DIM { emb.push(0.0); }
|
||||
emb.truncate(DIM);
|
||||
emb
|
||||
}
|
||||
|
||||
fn cosine_sim(a: &[f32], b: &[f32]) -> f64 {
|
||||
let (mut d, mut na, mut nb) = (0.0f64, 0.0f64, 0.0f64);
|
||||
for i in 0..a.len().min(b.len()) {
|
||||
d += a[i] as f64 * b[i] as f64;
|
||||
na += (a[i] as f64).powi(2); nb += (b[i] as f64).powi(2);
|
||||
}
|
||||
let dn = na.sqrt() * nb.sqrt();
|
||||
if dn < 1e-15 { 0.0 } else { d / dn }
|
||||
}
|
||||
|
||||
fn unary_anomaly_score(tx: &Transaction, mean_amt: f64, std_amt: f64) -> f64 {
|
||||
let amt_dev = ((tx.amount - mean_amt) / std_amt.max(1e-6)).abs();
|
||||
let time_anom = if tx.hour < 5.0 || tx.hour > 23.0 { 0.8 } else { 0.0 };
|
||||
let vel_spike = if tx.velocity > 5.0 { (tx.velocity - 5.0) * 0.3 } else { 0.0 };
|
||||
let geo_imp = if tx.geo_distance_km > 500.0 {
|
||||
(tx.geo_distance_km / 1000.0).min(2.0)
|
||||
} else { 0.0 };
|
||||
let refund_flag = if tx.amount < 0.0 { 0.6 } else { 0.0 };
|
||||
0.4 * amt_dev + 0.5 * time_anom + 0.6 * vel_spike + 0.8 * geo_imp + refund_flag - 0.7
|
||||
}
|
||||
|
||||
struct Edge { from: usize, to: usize, weight: f64 }
|
||||
|
||||
fn build_graph(txs: &[Transaction], embs: &[Vec<f32>],
|
||||
alpha: f64, beta: f64, k: usize) -> Vec<Edge> {
|
||||
let m = txs.len();
|
||||
let mut edges = Vec::new();
|
||||
|
||||
// Temporal chain: consecutive transactions by same card
|
||||
let mut by_card: Vec<Vec<usize>> = vec![Vec::new(); 200];
|
||||
for (i, tx) in txs.iter().enumerate() { by_card[tx.card_id].push(i); }
|
||||
for card_txs in &by_card {
|
||||
for w in card_txs.windows(2) {
|
||||
edges.push(Edge { from: w[0], to: w[1], weight: alpha });
|
||||
edges.push(Edge { from: w[1], to: w[0], weight: alpha });
|
||||
}
|
||||
}
|
||||
|
||||
// Merchant-category edges: connect same-merchant transactions (sampled)
|
||||
let mut by_merchant: Vec<Vec<usize>> = vec![Vec::new(); 10];
|
||||
for (i, tx) in txs.iter().enumerate() { by_merchant[tx.merchant_cat].push(i); }
|
||||
for mtxs in &by_merchant {
|
||||
for w in mtxs.windows(2) {
|
||||
edges.push(Edge { from: w[0], to: w[1], weight: alpha * 0.3 });
|
||||
edges.push(Edge { from: w[1], to: w[0], weight: alpha * 0.3 });
|
||||
}
|
||||
}
|
||||
|
||||
// kNN similarity edges from embeddings
|
||||
for i in 0..m {
|
||||
let mut sims: Vec<(usize, f64)> = (0..m)
|
||||
.filter(|&j| j != i)
|
||||
.map(|j| (j, cosine_sim(&embs[i], &embs[j]).max(0.0)))
|
||||
.collect();
|
||||
sims.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
||||
for &(j, s) in sims.iter().take(k) {
|
||||
if s > 0.1 { edges.push(Edge { from: i, to: j, weight: beta * s }); }
|
||||
}
|
||||
}
|
||||
edges
|
||||
}
|
||||
|
||||
fn solve_mincut(lambdas: &[f64], edges: &[Edge], gamma: f64) -> Vec<bool> {
|
||||
let m = lambdas.len();
|
||||
let (s, t, n) = (m, m + 1, m + 2);
|
||||
let mut adj: Vec<Vec<(usize, usize)>> = vec![Vec::new(); n];
|
||||
let mut caps: Vec<f64> = Vec::new();
|
||||
let ae = |adj: &mut Vec<Vec<(usize, usize)>>, caps: &mut Vec<f64>,
|
||||
u: usize, v: usize, c: f64| {
|
||||
let idx = caps.len(); caps.push(c); caps.push(0.0);
|
||||
adj[u].push((v, idx)); adj[v].push((u, idx + 1));
|
||||
};
|
||||
for i in 0..m {
|
||||
let p0 = lambdas[i].max(0.0);
|
||||
let p1 = (-lambdas[i]).max(0.0);
|
||||
if p0 > 1e-12 { ae(&mut adj, &mut caps, s, i, p0); }
|
||||
if p1 > 1e-12 { ae(&mut adj, &mut caps, i, t, p1); }
|
||||
}
|
||||
for e in edges {
|
||||
let c = gamma * e.weight;
|
||||
if c > 1e-12 { ae(&mut adj, &mut caps, e.from, e.to, c); }
|
||||
}
|
||||
loop {
|
||||
let mut par: Vec<Option<(usize, usize)>> = vec![None; n];
|
||||
let mut vis = vec![false; n];
|
||||
let mut q = std::collections::VecDeque::new();
|
||||
vis[s] = true; q.push_back(s);
|
||||
while let Some(u) = q.pop_front() {
|
||||
if u == t { break; }
|
||||
for &(v, ei) in &adj[u] {
|
||||
if !vis[v] && caps[ei] > 1e-15 {
|
||||
vis[v] = true; par[v] = Some((u, ei)); q.push_back(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !vis[t] { break; }
|
||||
let mut bn = f64::MAX;
|
||||
let mut v = t;
|
||||
while let Some((u, ei)) = par[v] { bn = bn.min(caps[ei]); v = u; }
|
||||
v = t;
|
||||
while let Some((u, ei)) = par[v] { caps[ei] -= bn; caps[ei ^ 1] += bn; v = u; }
|
||||
}
|
||||
let mut reach = vec![false; n];
|
||||
let mut stk = vec![s]; reach[s] = true;
|
||||
while let Some(u) = stk.pop() {
|
||||
for &(v, ei) in &adj[u] {
|
||||
if !reach[v] && caps[ei] > 1e-15 { reach[v] = true; stk.push(v); }
|
||||
}
|
||||
}
|
||||
(0..m).map(|i| reach[i]).collect()
|
||||
}
|
||||
|
||||
fn threshold_detect(txs: &[Transaction], mean_amt: f64, std_amt: f64) -> Vec<bool> {
|
||||
txs.iter().map(|tx| {
|
||||
let amt_z = ((tx.amount - mean_amt) / std_amt.max(1e-6)).abs();
|
||||
amt_z > 3.0 || tx.velocity > 5.0 || tx.geo_distance_km > 500.0 || tx.amount < 0.0
|
||||
}).collect()
|
||||
}
|
||||
|
||||
struct Metrics { precision: f64, recall: f64, f1: f64, fpr: f64 }
|
||||
|
||||
fn evaluate(txs: &[Transaction], preds: &[bool]) -> Metrics {
|
||||
let (mut tp, mut fp, mut tn, mut fn_) = (0u64, 0, 0, 0);
|
||||
for (tx, &p) in txs.iter().zip(preds) {
|
||||
let truth = tx.fraud != FraudType::Legit;
|
||||
match (p, truth) {
|
||||
(true, true) => tp += 1, (true, false) => fp += 1,
|
||||
(false, false) => tn += 1, (false, true) => fn_ += 1,
|
||||
}
|
||||
}
|
||||
let precision = if tp + fp > 0 { tp as f64 / (tp + fp) as f64 } else { 0.0 };
|
||||
let recall = if tp + fn_ > 0 { tp as f64 / (tp + fn_) as f64 } else { 0.0 };
|
||||
let f1 = if precision + recall > 0.0 { 2.0 * precision * recall / (precision + recall) } else { 0.0 };
|
||||
let fpr = if tn + fp > 0 { fp as f64 / (tn + fp) as f64 } else { 0.0 };
|
||||
Metrics { precision, recall, f1, fpr }
|
||||
}
|
||||
|
||||
fn per_type_detection(txs: &[Transaction], preds: &[bool]) -> Vec<(&'static str, usize, usize)> {
|
||||
let types = [
|
||||
FraudType::CardNotPresent, FraudType::AccountTakeover,
|
||||
FraudType::CardClone, FraudType::Synthetic, FraudType::Refund,
|
||||
];
|
||||
types.iter().map(|ft| {
|
||||
let total = txs.iter().filter(|tx| tx.fraud == *ft).count();
|
||||
let detected = txs.iter().zip(preds).filter(|(tx, &p)| tx.fraud == *ft && p).count();
|
||||
(ft.label(), detected, total)
|
||||
}).collect()
|
||||
}
|
||||
|
||||
fn amount_bucket(amt: f64) -> &'static str {
|
||||
let a = amt.abs();
|
||||
if a < 25.0 { "micro" } else if a < 100.0 { "small" }
|
||||
else if a < 500.0 { "medium" } else if a < 2000.0 { "large" }
|
||||
else { "xlarge" }
|
||||
}
|
||||
|
||||
fn hex(b: &[u8]) -> String { b.iter().map(|x| format!("{:02x}", x)).collect() }
|
||||
|
||||
fn main() {
|
||||
println!("=== Financial Fraud Detection via Graph Cut / MRF ===\n");
|
||||
|
||||
let (alpha, beta, gamma, k_nn) = (0.25, 0.12, 0.35, 3usize);
|
||||
let seed = 42u64;
|
||||
|
||||
// Generate transactions
|
||||
let txs = generate_transactions(N_TX, seed);
|
||||
let n_fraud = txs.iter().filter(|tx| tx.fraud != FraudType::Legit).count();
|
||||
let fraud_rate = n_fraud as f64 / N_TX as f64 * 100.0;
|
||||
println!(" Transactions: {} | Fraud: {} ({:.2}%) | Cards: 200\n", N_TX, n_fraud, fraud_rate);
|
||||
|
||||
// Distribution stats
|
||||
let amounts: Vec<f64> = txs.iter().map(|tx| tx.amount).collect();
|
||||
let mean_amt = amounts.iter().sum::<f64>() / amounts.len() as f64;
|
||||
let std_amt = (amounts.iter().map(|a| (a - mean_amt).powi(2)).sum::<f64>()
|
||||
/ amounts.len() as f64).sqrt();
|
||||
let pos_amounts: Vec<f64> = amounts.iter().filter(|&&a| a > 0.0).cloned().collect();
|
||||
let mut sorted_pos = pos_amounts.clone();
|
||||
sorted_pos.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
let median = sorted_pos[sorted_pos.len() / 2];
|
||||
println!(" Amount stats: median=${:.0}, mean=${:.0}, std=${:.0}", median, mean_amt, std_amt);
|
||||
|
||||
println!(" Fraud breakdown:");
|
||||
for ft in &[FraudType::CardNotPresent, FraudType::AccountTakeover,
|
||||
FraudType::CardClone, FraudType::Synthetic, FraudType::Refund] {
|
||||
let c = txs.iter().filter(|tx| tx.fraud == *ft).count();
|
||||
if c > 0 { println!(" {}: {}", ft.label(), c); }
|
||||
}
|
||||
|
||||
// Extract embeddings
|
||||
let embs: Vec<Vec<f32>> = txs.iter()
|
||||
.map(|tx| extract_embedding(tx, mean_amt, std_amt)).collect();
|
||||
|
||||
// Unary anomaly scores
|
||||
let lambdas: Vec<f64> = txs.iter()
|
||||
.map(|tx| unary_anomaly_score(tx, mean_amt, std_amt)).collect();
|
||||
|
||||
// Build graph and solve mincut
|
||||
println!("\n Building transaction graph...");
|
||||
let edges = build_graph(&txs, &embs, alpha, beta, k_nn);
|
||||
println!(" Graph: {} nodes, {} edges", N_TX, edges.len());
|
||||
|
||||
let gc_preds = solve_mincut(&lambdas, &edges, gamma);
|
||||
let gc_flagged = gc_preds.iter().filter(|&&p| p).count();
|
||||
|
||||
let th_preds = threshold_detect(&txs, mean_amt, std_amt);
|
||||
let th_flagged = th_preds.iter().filter(|&&p| p).count();
|
||||
|
||||
// Evaluate
|
||||
let gc_m = evaluate(&txs, &gc_preds);
|
||||
let th_m = evaluate(&txs, &th_preds);
|
||||
|
||||
println!("\n {:>12} {:>9} {:>9} {:>9} {:>9} {:>7}",
|
||||
"Method", "Prec", "Recall", "F1", "FPR", "Flagged");
|
||||
println!(" {:->12} {:->9} {:->9} {:->9} {:->9} {:->7}", "", "", "", "", "", "");
|
||||
println!(" {:>12} {:>9.3} {:>9.3} {:>9.3} {:>9.4} {:>7}",
|
||||
"Graph Cut", gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr, gc_flagged);
|
||||
println!(" {:>12} {:>9.3} {:>9.3} {:>9.3} {:>9.4} {:>7}",
|
||||
"Threshold", th_m.precision, th_m.recall, th_m.f1, th_m.fpr, th_flagged);
|
||||
|
||||
let f1_imp = if th_m.f1 > 0.0 { (gc_m.f1 - th_m.f1) / th_m.f1 * 100.0 } else { 0.0 };
|
||||
let fpr_imp = if th_m.fpr > 0.0 { (th_m.fpr - gc_m.fpr) / th_m.fpr * 100.0 } else { 0.0 };
|
||||
println!("\n Graph cut vs threshold: F1 {:+.1}%, FPR reduction {:.1}%", f1_imp, fpr_imp);
|
||||
|
||||
// Per-type detection
|
||||
println!("\n Per-fraud-type detection:");
|
||||
println!(" {:>8} {:>12} {:>12}", "Type", "GraphCut", "Threshold");
|
||||
println!(" {:->8} {:->12} {:->12}", "", "", "");
|
||||
let gc_types = per_type_detection(&txs, &gc_preds);
|
||||
let th_types = per_type_detection(&txs, &th_preds);
|
||||
for (gc_t, th_t) in gc_types.iter().zip(th_types.iter()) {
|
||||
println!(" {:>8} {:>5}/{:<5} {:>5}/{:<5}",
|
||||
gc_t.0, gc_t.1, gc_t.2, th_t.1, th_t.2);
|
||||
}
|
||||
|
||||
// RVF ingestion
|
||||
println!("\n--- RVF Ingestion ---");
|
||||
let tmp = TempDir::new().expect("tmpdir");
|
||||
let opts = RvfOptions {
|
||||
dimension: DIM as u16, metric: DistanceMetric::Cosine, ..Default::default()
|
||||
};
|
||||
let mut store = RvfStore::create(&tmp.path().join("fraud.rvfin"), opts).expect("create");
|
||||
|
||||
let mut vecs: Vec<Vec<f32>> = Vec::new();
|
||||
let mut ids: Vec<u64> = Vec::new();
|
||||
let mut meta: Vec<MetadataEntry> = Vec::new();
|
||||
|
||||
for (i, tx) in txs.iter().enumerate() {
|
||||
vecs.push(embs[i].clone());
|
||||
ids.push(i as u64);
|
||||
meta.push(MetadataEntry {
|
||||
field_id: FIELD_MERCHANT,
|
||||
value: MetadataValue::String(MERCHANTS[tx.merchant_cat].into()),
|
||||
});
|
||||
meta.push(MetadataEntry {
|
||||
field_id: FIELD_AMOUNT_BUCKET,
|
||||
value: MetadataValue::String(amount_bucket(tx.amount).into()),
|
||||
});
|
||||
meta.push(MetadataEntry {
|
||||
field_id: FIELD_FRAUD_TYPE,
|
||||
value: MetadataValue::String(tx.fraud.label().into()),
|
||||
});
|
||||
}
|
||||
|
||||
let refs: Vec<&[f32]> = vecs.iter().map(|v| v.as_slice()).collect();
|
||||
let ing = store.ingest_batch(&refs, &ids, Some(&meta)).expect("ingest");
|
||||
println!(" Ingested: {} (rejected: {})", ing.accepted, ing.rejected);
|
||||
|
||||
// Filtered queries: retrieve similar fraud patterns by merchant
|
||||
println!("\n--- Filtered Queries ---");
|
||||
let fraud_idx = txs.iter().position(|tx| tx.fraud != FraudType::Legit).unwrap_or(0);
|
||||
let qv = &embs[fraud_idx];
|
||||
let res = store.query(qv, 10, &QueryOptions::default()).expect("q");
|
||||
println!(" Similar to fraud tx #{}: {} results", fraud_idx, res.len());
|
||||
for r in res.iter().take(5) {
|
||||
let tx = &txs[r.id as usize];
|
||||
println!(" id={:>5} dist={:.4} merchant={:<15} fraud={}",
|
||||
r.id, r.distance, MERCHANTS[tx.merchant_cat], tx.fraud.label());
|
||||
}
|
||||
|
||||
let fm = FilterExpr::Eq(FIELD_MERCHANT, FilterValue::String("online_retail".into()));
|
||||
let mr = store.query(qv, 5, &QueryOptions { filter: Some(fm), ..Default::default() }).expect("q");
|
||||
println!(" Online-retail only: {} results", mr.len());
|
||||
|
||||
let ff = FilterExpr::Eq(FIELD_AMOUNT_BUCKET, FilterValue::String("large".into()));
|
||||
let lr = store.query(qv, 5, &QueryOptions { filter: Some(ff), ..Default::default() }).expect("q");
|
||||
println!(" Large-amount only: {} results", lr.len());
|
||||
|
||||
// Witness chain: audit trail
|
||||
println!("\n--- Witness Chain (Audit Trail) ---");
|
||||
let steps = [
|
||||
("tx_ingest", 0x08u8), ("feature_extract", 0x02), ("graph_build", 0x02),
|
||||
("mincut", 0x02), ("classify", 0x02), ("alert", 0x01), ("seal", 0x01),
|
||||
];
|
||||
let entries: Vec<WitnessEntry> = steps.iter().enumerate().map(|(i, (step, wt))| WitnessEntry {
|
||||
prev_hash: [0u8; 32],
|
||||
action_hash: shake256_256(format!("fraud_gc:{}:{}", step, i).as_bytes()),
|
||||
timestamp_ns: 1_700_000_000_000_000_000 + i as u64 * 1_000_000_000,
|
||||
witness_type: *wt,
|
||||
}).collect();
|
||||
let chain = create_witness_chain(&entries);
|
||||
let verified = verify_witness_chain(&chain).expect("verify");
|
||||
println!(" {} entries, {} bytes, VALID", verified.len(), chain.len());
|
||||
for (i, (step, _)) in steps.iter().enumerate() {
|
||||
let wn = match verified[i].witness_type {
|
||||
0x01 => "PROV", 0x02 => "COMP", 0x08 => "DATA", _ => "????",
|
||||
};
|
||||
println!(" [{:>4}] {:>2} -> {}", wn, i, step);
|
||||
}
|
||||
|
||||
// Lineage
|
||||
println!("\n--- Lineage ---");
|
||||
let child = store.derive(
|
||||
&tmp.path().join("fraud_report.rvfin"), DerivationType::Filter, None,
|
||||
).expect("derive");
|
||||
println!(" parent: {} -> child: {} (depth {})",
|
||||
hex(store.file_id()), hex(child.parent_id()), child.lineage_depth());
|
||||
child.close().expect("close");
|
||||
|
||||
// Summary
|
||||
println!("\n=== Summary ===");
|
||||
println!(" {} transactions, {} fraud ({:.2}%)", N_TX, n_fraud, fraud_rate);
|
||||
println!(" Graph cut: prec={:.3} recall={:.3} F1={:.3} FPR={:.4}",
|
||||
gc_m.precision, gc_m.recall, gc_m.f1, gc_m.fpr);
|
||||
println!(" Threshold: prec={:.3} recall={:.3} F1={:.3} FPR={:.4}",
|
||||
th_m.precision, th_m.recall, th_m.f1, th_m.fpr);
|
||||
println!(" RVF: {} embeddings | Witness: {} steps", ing.accepted, verified.len());
|
||||
println!(" alpha={:.2} beta={:.2} gamma={:.2} k={}", alpha, beta, gamma, k_nn);
|
||||
store.close().expect("close");
|
||||
println!("\nDone.");
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue