fix(brain): dramatically raise gist quality bar — real innovations only

Problem: gists still publishing recycled "X associated_with Y" noise.

Threshold changes:
- MIN_NEW_INFERENCES: 5 → 10
- MIN_EVIDENCE: 500 → 1000
- MIN_STRANGE_LOOP_SCORE: 0.05 → 0.1
- MIN_PROPOSITIONS: 10 → 20
- MIN_SONA_PATTERNS: 0 → 1 (require SONA learning)
- MIN_PARETO_GROWTH: 2 → 3
- MIN_INFERENCE_CONFIDENCE: 0.60 → 0.70
- New: MIN_UNIQUE_CATEGORIES = 4 (prevent recycling same domains)
- Rate limit: 24h → 72h (3 days between gists)
- Cross-domain similarity: 0.45 → 0.55

Quality filters:
- Reject ALL "may be associated with", "co-occurs with", "similar_to"
- Reject inferences < 50 chars
- Require 3+ strong inferences, 5+ strong propositions, 4+ unique categories
- Kill co_occurs_with and similar_to entirely from publishable set

Target: ~1 gist per week, only for genuinely novel cross-domain discoveries.

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
rUv 2026-03-25 22:26:55 +00:00
parent cd9d8ba2db
commit ea266ddaac
2 changed files with 70 additions and 35 deletions

View file

@ -15,23 +15,26 @@ use parking_lot::Mutex;
use serde::{Deserialize, Serialize};
// ── Novelty thresholds ──
// Tuned aggressively: only publish genuinely novel, high-confidence findings.
// Previous thresholds (2/100/0.008) allowed floods of "weak co-occurrence" noise.
// These gates should yield ~1 gist per day at most when substantive new data arrives.
/// Minimum new inferences: must derive non-trivial forward-chained claims
const MIN_NEW_INFERENCES: usize = 5;
/// Minimum evidence observations — need enough data for statistical significance
const MIN_EVIDENCE: usize = 500;
/// Minimum strange loop quality score — higher = more self-aware reasoning
const MIN_STRANGE_LOOP_SCORE: f32 = 0.05;
// VERY aggressive: only publish when something genuinely new is discovered.
// With ~3100 memories and 2.8M edges, the bar must be HIGH to avoid noise.
// Target: ~1 gist per WEEK, only for real innovations.
/// Minimum new inferences: must derive many non-trivial forward-chained claims
const MIN_NEW_INFERENCES: usize = 10;
/// Minimum evidence observations — need substantial data
const MIN_EVIDENCE: usize = 1000;
/// Minimum strange loop quality score — high bar for self-aware reasoning
const MIN_STRANGE_LOOP_SCORE: f32 = 0.1;
/// Minimum propositions extracted in this cycle
const MIN_PROPOSITIONS: usize = 10;
/// Minimum SONA patterns — 0 means SONA isn't required (it needs trajectory data)
const MIN_SONA_PATTERNS: usize = 0;
/// Minimum Pareto front growth — evolution must have found new solutions
const MIN_PARETO_GROWTH: usize = 2;
const MIN_PROPOSITIONS: usize = 20;
/// Minimum SONA patterns — require at least some SONA learning
const MIN_SONA_PATTERNS: usize = 1;
/// Minimum Pareto front growth — evolution must find multiple new solutions
const MIN_PARETO_GROWTH: usize = 3;
/// Minimum confidence for ANY inference to be included in a discovery
const MIN_INFERENCE_CONFIDENCE: f64 = 0.60;
const MIN_INFERENCE_CONFIDENCE: f64 = 0.70;
/// Minimum number of UNIQUE categories across strong propositions
/// (prevents "debug-architecture-geopolitics" recycling)
const MIN_UNIQUE_CATEGORIES: usize = 4;
/// A discovery worthy of publishing.
///
@ -80,18 +83,31 @@ impl Discovery {
/// Filter out weak/generic inferences, keeping only substantive ones.
/// Returns the strong inferences that survive the quality gate.
pub fn strong_inferences(&self) -> Vec<&str> {
// Known boring patterns that should never be published
let boring_patterns = [
"shows weak co-occurrence",
"may be associated with",
"co-occurs with",
"is_type_of",
"similar_to",
];
self.inferences.iter()
.filter(|inf| {
// Reject generic "weak co-occurrence" noise
let lower = inf.to_lowercase();
if lower.contains("shows weak co-occurrence") {
return false;
// Reject ALL known boring patterns
for pattern in &boring_patterns {
if lower.contains(pattern) { return false; }
}
// Reject inferences with generic cluster IDs as subjects
if lower.starts_with("cluster_") {
return false;
}
// Require minimum confidence (parse from explanation string)
// Reject inferences with generic cluster IDs
if lower.starts_with("cluster_") { return false; }
// Reject short/generic inferences
if inf.len() < 50 { return false; }
// Require HIGH confidence (parse from explanation string)
if let Some(pct_start) = lower.find("confidence: ") {
let rest = &lower[pct_start + 12..];
if let Some(pct_end) = rest.find('%') {
@ -100,38 +116,58 @@ impl Discovery {
}
}
}
// If we can't parse confidence, keep it only if it has substance
!lower.contains("weak") && inf.len() > 30
// Must not contain "weak" anywhere
!lower.contains("weak")
})
.map(|s| s.as_str())
.collect()
}
/// Filter propositions to only those with confidence >= threshold.
/// Filter propositions to only high-confidence, non-generic ones.
pub fn strong_propositions(&self) -> Vec<&(String, String, String, f64)> {
self.propositions.iter()
.filter(|(subj, pred, _obj, conf)| {
// Skip generic cluster labels
if subj.starts_with("cluster_") { return false; }
// Skip "co_occurs_with" at low confidence
if pred == "co_occurs_with" && *conf < 0.55 { return false; }
// Skip ALL co_occurs_with — these are never interesting
if pred == "co_occurs_with" { return false; }
// Skip similar_to within same domain — too obvious
if pred == "similar_to" { return false; }
// Only keep high-confidence cross-domain findings
*conf >= MIN_INFERENCE_CONFIDENCE
})
.collect()
}
/// Count unique categories across strong propositions.
fn category_diversity(&self) -> usize {
let mut cats = std::collections::HashSet::new();
for (subj, _, obj, conf) in &self.propositions {
if *conf >= MIN_INFERENCE_CONFIDENCE && !subj.starts_with("cluster_") {
cats.insert(subj.as_str());
cats.insert(obj.as_str());
}
}
cats.len()
}
/// Check if this discovery meets the novelty bar for publishing.
/// This is intentionally VERY strict — we want ~1 gist per week.
pub fn is_publishable(&self) -> bool {
let strong = self.strong_inferences();
let strong_props = self.strong_propositions();
let diversity = self.category_diversity();
self.new_inferences >= MIN_NEW_INFERENCES
&& self.evidence_count >= MIN_EVIDENCE
&& self.strange_loop_score >= MIN_STRANGE_LOOP_SCORE
&& self.propositions_extracted >= MIN_PROPOSITIONS
&& self.sona_patterns >= MIN_SONA_PATTERNS
&& self.pareto_growth >= MIN_PARETO_GROWTH
&& strong.len() >= 2 // Must have at least 2 non-trivial inferences
&& strong_props.len() >= 3 // Must have at least 3 substantive propositions
&& strong.len() >= 3 // Must have at least 3 non-trivial inferences
&& strong_props.len() >= 5 // Must have at least 5 substantive propositions
&& diversity >= MIN_UNIQUE_CATEGORIES // Must span multiple domains
}
/// Explain why a discovery was or wasn't published.
@ -192,7 +228,7 @@ impl GistPublisher {
Some(Self {
token,
last_publish: Mutex::new(None),
min_interval: Duration::from_secs(86400), // 24 hour minimum between gists
min_interval: Duration::from_secs(259200), // 3 day minimum between gists
published_count: Mutex::new(0),
published_titles: Mutex::new(Vec::new()),
})

View file

@ -423,10 +423,9 @@ impl NeuralSymbolicBridge {
let sim = cosine_similarity(c1, c2);
let cross_domain = cat1 != cat2;
// Skip weak signals — raised from 0.3 to 0.45 to eliminate
// the flood of "weak co-occurrence" noise in gist publications.
// At 0.3, nearly every category pair generates a proposition.
if sim < 0.45 {
// Skip weak signals — raised to 0.55 to eliminate noise.
// Only extract propositions for genuinely similar cross-domain clusters.
if sim < 0.55 {
continue;
}