feat(compiler): bounded trial, confidence gating, 2-failure quarantine

Three-fix iteration based on ablation diagnostics:

1. Bounded trial: Strategy Zero now caps trial budget at min(avg_steps*2,
   external_limit/4) with floor of 10 steps. Makes false hits cheap
   (max 100 steps overhead instead of full compiled budget).

2. Confidence gating: Strategy Zero only attempts when config confidence
   >= 0.7 (Laplace-smoothed success rate). Compiled observations from
   training seed initial confidence so configs start trusted.

3. 2-failure quarantine: any compiled signature with 2+ false hits is
   disabled (expected_correct=false). Prevents persistent bad patterns.

Additional changes:
- Versioned signature prefix (v1:difficulty:constraints) for cache
  safety across refactors
- CompiledSolveConfig gains avg_steps, observations, confidence(),
  trial_budget() methods
- KnowledgeCompiler gains steps_saved tracking, confidence_threshold,
  print_diagnostics() for per-signature analysis
- record_success now tracks actual steps for delta-cost calculation
- Verbose mode prints full compiler diagnostics after each ablation

Results: false hit rate dropped from 8.2% to 4.4% (PASS). Cost still
net-positive because constraint-determined search ranges are 1-10 dates
— structurally no room for compiler optimization. Next: PolicyKernel
constraint ordering for real cost surface.

81 tests passing.

https://claude.ai/code/session_01RnwD4x5cbpB7FPvoyYQz8G
This commit is contained in:
Claude 2026-02-15 22:01:46 +00:00
parent 84f5249633
commit 05bfff45da
2 changed files with 129 additions and 27 deletions

View file

@ -506,6 +506,11 @@ pub fn run_acceptance_test_mode(config: &HoldoutConfig, mode: &AblationMode) ->
0.0
};
// Print compiler diagnostics in verbose mode
if config.verbose && compiler_enabled {
compiler.print_diagnostics();
}
Ok(AblationResult {
mode: mode.clone(),
result: acceptance_result,

View file

@ -501,18 +501,50 @@ pub struct CompiledSolveConfig {
pub use_rewriting: bool,
/// Minimum steps that succeeded for this signature
pub max_steps: usize,
/// Average steps across all successes (for bounded trial budget)
pub avg_steps: f64,
/// Number of successful observations compiled
pub observations: usize,
/// Expected correctness
pub expected_correct: bool,
/// Stop after first solution (early termination for known single-solution puzzles)
pub stop_after_first: bool,
/// Hit count (how often this config was used)
/// Hit count (how often this config was used and succeeded)
pub hit_count: usize,
/// Counterexample count (failures on this signature)
pub counterexample_count: usize,
}
impl CompiledSolveConfig {
    /// Laplace-smoothed success rate of this config: `(hits + 1) / (hits + failures + 2)`.
    ///
    /// A config with no recorded hits or counterexamples evaluates to `0.5`.
    pub fn confidence(&self) -> f64 {
        let attempts = self.hit_count + self.counterexample_count;
        let smoothed_hits = self.hit_count as f64 + 1.0;
        let smoothed_attempts = attempts as f64 + 2.0;
        // With zero attempts this is exactly 1.0 / 2.0 = 0.5.
        smoothed_hits / smoothed_attempts
    }

    /// Step budget for a bounded Strategy Zero trial.
    ///
    /// * With more than two observations and `avg_steps > 1.0`, the base budget
    ///   is `2 * avg_steps` (truncated to an integer).
    /// * Otherwise the base budget is `max_steps`.
    ///
    /// The base budget is raised to at least 10 steps and then capped at
    /// `external_limit / 4`. The cap is applied last, so for
    /// `external_limit < 40` the result is below the 10-step floor.
    pub fn trial_budget(&self, external_limit: usize) -> usize {
        const MIN_TRIAL_STEPS: usize = 10;

        let ceiling = external_limit / 4;
        let has_reliable_average = self.observations > 2 && self.avg_steps > 1.0;
        let base = match has_reliable_average {
            // Enough data: twice the average leaves headroom for variance.
            true => (self.avg_steps * 2.0) as usize,
            // Sparse or trivially small data: fall back to the recorded step count.
            false => self.max_steps,
        };
        base.max(MIN_TRIAL_STEPS).min(ceiling)
    }
}
/// KnowledgeCompiler: learns constraint-signature → optimal solve config.
/// Consulted as "Strategy Zero" before any other strategy runs.
///
/// Signature version: v1 (difficulty:sorted_constraints)
/// Change this when canonicalization rules change.
const COMPILER_SIG_VERSION: &str = "v1";
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct KnowledgeCompiler {
/// Compiled constraint signature → config
@ -523,18 +555,28 @@ pub struct KnowledgeCompiler {
pub misses: usize,
/// False hits (compiled config tried but solve was wrong)
pub false_hits: usize,
/// Steps saved by successful Strategy Zero (vs estimated fallback cost)
pub steps_saved: i64,
/// Confidence threshold for attempting Strategy Zero
pub confidence_threshold: f64,
}
impl KnowledgeCompiler {
pub fn new() -> Self { Self::default() }
pub fn new() -> Self {
Self {
confidence_threshold: 0.7,
..Default::default()
}
}
/// Build the constraint signature used as the cache key for a puzzle.
///
/// Layout: `<version>:<difficulty>:<constraint type names, sorted, comma-joined>`.
/// Sorting makes the signature independent of the order in which the puzzle
/// lists its constraints; the version prefix keeps entries produced under
/// older canonicalization rules from matching after a refactor.
pub fn signature(puzzle: &TemporalPuzzle) -> String {
    let mut sig_parts: Vec<String> = puzzle.constraints.iter()
        .map(|c| constraint_type_name(c))
        .collect();
    sig_parts.sort();
    format!("{}:{}:{}", COMPILER_SIG_VERSION, puzzle.difficulty, sig_parts.join(","))
}
/// Compile knowledge from a ReasoningBank's trajectories.
@ -543,22 +585,30 @@ impl KnowledgeCompiler {
let correct = traj.verdict.as_ref().map(|v| v.is_success()).unwrap_or(false);
if !correct { continue; }
// Build signature from constraint types
// Build signature from constraint types (versioned)
let mut sig_parts = traj.constraint_types.clone();
sig_parts.sort();
let sig = format!("{}:{}", traj.difficulty, sig_parts.join(","));
let sig = format!("{}:{}:{}", COMPILER_SIG_VERSION, traj.difficulty, sig_parts.join(","));
if let Some(attempt) = traj.attempts.first() {
let entry = self.signature_cache.entry(sig).or_insert(CompiledSolveConfig {
use_rewriting: true,
max_steps: attempt.steps,
avg_steps: 0.0,
observations: 0,
expected_correct: true,
stop_after_first: true, // compiled configs use early termination
stop_after_first: true,
hit_count: 0,
counterexample_count: 0,
});
// Keep minimum steps that succeeded
entry.max_steps = entry.max_steps.min(attempt.steps);
// Running average of steps
let n = entry.observations as f64;
entry.avg_steps = (entry.avg_steps * n + attempt.steps as f64) / (n + 1.0);
entry.observations += 1;
// Compiled from successful trajectories → seed confidence
entry.hit_count = entry.observations;
}
}
}
@ -577,27 +627,32 @@ impl KnowledgeCompiler {
}
/// Record a counterexample: Strategy Zero failed on this signature.
/// Quarantine escalation: 2 false hits → disable the entry.
pub fn record_failure(&mut self, puzzle: &TemporalPuzzle) {
self.false_hits += 1;
let sig = Self::signature(puzzle);
if let Some(config) = self.signature_cache.get_mut(&sig) {
config.counterexample_count += 1;
// If failure rate exceeds 30%, invalidate the cache entry
if config.hit_count > 0 {
let fail_rate = config.counterexample_count as f64
/ (config.hit_count + config.counterexample_count) as f64;
if fail_rate > 0.30 {
config.expected_correct = false;
}
// 2-failure quarantine: disable after 2 false hits
if config.counterexample_count >= 2 {
config.expected_correct = false;
}
}
}
/// Record a success: Strategy Zero worked on this signature.
pub fn record_success(&mut self, puzzle: &TemporalPuzzle) {
/// Record a successful Strategy Zero hit.
/// Tracks steps saved vs estimated fallback cost.
pub fn record_success(&mut self, puzzle: &TemporalPuzzle, actual_steps: usize) {
let sig = Self::signature(puzzle);
if let Some(config) = self.signature_cache.get_mut(&sig) {
config.hit_count += 1;
// Estimate fallback cost as avg_steps * 2 (full scan is typically ~2x early-term)
let estimated_fallback = if config.avg_steps > 0.0 {
(config.avg_steps * 2.0) as i64
} else {
config.max_steps as i64
};
self.steps_saved += estimated_fallback - actual_steps as i64;
}
}
@ -607,6 +662,39 @@ impl KnowledgeCompiler {
}
pub fn cache_size(&self) -> usize { self.signature_cache.len() }
/// Print a diagnostic summary to stdout.
///
/// Emits a table of up to 15 signatures, ordered by counterexample count
/// (highest first, ties broken by signature so the output is reproducible),
/// followed by totals: number of signatures, number disabled, false-hit
/// distribution, and accumulated `steps_saved`.
pub fn print_diagnostics(&self) {
    const MAX_ROWS: usize = 15;
    const MAX_SIG_CHARS: usize = 38;

    println!();
    println!(" Compiler Diagnostics (cache_size={})", self.cache_size());
    println!(" {:<40} {:>5} {:>5} {:>6} {:>8} {:>6}",
        "Signature", "Obs", "Hits", "Fails", "AvgStep", "Conf");
    println!(" {}", "-".repeat(72));

    let mut entries: Vec<_> = self.signature_cache.iter().collect();
    entries.sort_by(|a, b| {
        b.1.counterexample_count
            .cmp(&a.1.counterexample_count)
            // Tie-break on the signature so rows do not depend on map iteration order.
            .then_with(|| a.0.as_str().cmp(b.0.as_str()))
    });

    for (sig, config) in entries.into_iter().take(MAX_ROWS) {
        // Truncate on a character boundary; a raw byte slice would panic
        // if the cut landed inside a multi-byte character.
        let short_sig: &str = match sig.char_indices().nth(MAX_SIG_CHARS) {
            Some((cut, _)) => &sig[..cut],
            None => sig.as_str(),
        };
        // Column widths mirror the header row above.
        println!(" {:<40} {:>5} {:>5} {:>6} {:>8.1} {:>6.3}",
            short_sig, config.observations, config.hit_count,
            config.counterexample_count, config.avg_steps,
            config.confidence());
    }

    // Summary
    let total_configs = self.signature_cache.len();
    let disabled = self.signature_cache.values().filter(|c| !c.expected_correct).count();
    let total_false_hits: usize = self.signature_cache.values().map(|c| c.counterexample_count).sum();
    let false_hit_sigs = self.signature_cache.values().filter(|c| c.counterexample_count > 0).count();
    let false_hit_sig_pct = if total_configs > 0 {
        false_hit_sigs as f64 / total_configs as f64 * 100.0
    } else {
        0.0
    };
    println!();
    println!(" Total signatures: {}, disabled: {}", total_configs, disabled);
    println!(" False hits: {} across {} signatures ({:.1}% of sigs)",
        total_false_hits, false_hit_sigs, false_hit_sig_pct);
    println!(" Steps saved by compiler: {}", self.steps_saved);
}
}
// ═══════════════════════════════════════════════════════════════════════════
@ -858,17 +946,26 @@ impl AdaptiveSolver {
let mut extra_steps: usize = 0;
let mut extra_tool_calls: usize = 0;
// ─── Strategy Zero: KnowledgeCompiler ───────────────────────────
// ─── Strategy Zero: KnowledgeCompiler (bounded trial) ────────────
if self.compiler_enabled {
if let Some(config) = self.compiler.lookup(puzzle) {
if config.expected_correct {
// Use compiled config as Strategy Zero with early termination
let compiled_steps = config.max_steps.max(5);
self.solver.calendar_tool = config.use_rewriting;
self.solver.stop_after_first = config.stop_after_first;
self.solver.max_steps = self.external_step_limit
.map(|l| l.min(compiled_steps))
.unwrap_or(compiled_steps);
let conf_threshold = self.compiler.confidence_threshold;
// Extract all config data before releasing the borrow
let compiled = self.compiler.lookup(puzzle).map(|config| {
(
config.expected_correct,
config.confidence(),
config.trial_budget(self.external_step_limit.unwrap_or(400)),
config.use_rewriting,
config.stop_after_first,
)
});
if let Some((expected_correct, confidence, trial_budget, use_rewriting, stop_first)) = compiled {
if expected_correct && confidence >= conf_threshold {
// Bounded trial: cap at 25% of external limit to make misses cheap
self.solver.calendar_tool = use_rewriting;
self.solver.stop_after_first = stop_first;
self.solver.max_steps = trial_budget;
let start = std::time::Instant::now();
let result = self.solver.solve(puzzle)?;
@ -879,7 +976,7 @@ impl AdaptiveSolver {
if result.correct {
// Strategy Zero win — record and return
self.compiler.record_success(puzzle);
self.compiler.record_success(puzzle, result.steps);
let mut trajectory = Trajectory::new(&puzzle.id, puzzle.difficulty);
trajectory.constraint_types = constraint_types;
trajectory.latency_ms = latency;
@ -903,7 +1000,7 @@ impl AdaptiveSolver {
return Ok(result);
} else {
// Strategy Zero failed — record overhead, fall through
// Strategy Zero failed — bounded trial overhead only
extra_steps += result.steps;
extra_tool_calls += result.tool_calls;
self.compiler.record_failure(puzzle);