diff --git a/crates/ruvector-decompiler/examples/run_on_cli.rs b/crates/ruvector-decompiler/examples/run_on_cli.rs index ed8787fa..b9d86e7c 100644 --- a/crates/ruvector-decompiler/examples/run_on_cli.rs +++ b/crates/ruvector-decompiler/examples/run_on_cli.rs @@ -19,13 +19,21 @@ fn main() { std::process::exit(1); } }; - eprintln!("File size: {} bytes ({:.2} MB)", source.len(), source.len() as f64 / 1_048_576.0); + eprintln!( + "File size: {} bytes ({:.2} MB)", + source.len(), + source.len() as f64 / 1_048_576.0 + ); // Phase 1: Parse let t0 = Instant::now(); let decls = ruvector_decompiler::parser::parse_bundle(&source).unwrap(); let t_parse = t0.elapsed(); - eprintln!("Phase 1 (Parse): {:?} -- {} declarations found", t_parse, decls.len()); + eprintln!( + "Phase 1 (Parse): {:?} -- {} declarations found", + t_parse, + decls.len() + ); // Phase 2: Graph let t1 = Instant::now(); @@ -38,25 +46,25 @@ fn main() { graph.edge_count() ); - // Phase 3: Partition -- use target_modules=1 for very large graphs to skip MinCut + // Phase 3: Partition -- uses Louvain for large graphs automatically. let large_graph = graph.node_count() > 5000; - let target = if large_graph { - eprintln!("Phase 3 (Partition): SKIPPED (graph too large: {} nodes, {} edges)", graph.node_count(), graph.edge_count()); - eprintln!(" Note: MinCut partitioning is not feasible on graphs > 5000 nodes without approximation."); - Some(1) - } else { - None - }; - let t2 = Instant::now(); - let modules = ruvector_decompiler::partitioner::partition_modules(&graph, target).unwrap(); - let t_partition = t2.elapsed(); - if !large_graph { + if large_graph { eprintln!( - "Phase 3 (Partition): {:?} -- {} modules detected", - t_partition, - modules.len() + "Phase 3 (Partition): Using Louvain community detection ({} nodes, {} edges)", + graph.node_count(), + graph.edge_count() ); } + let t2 = Instant::now(); + let modules = + ruvector_decompiler::partitioner::partition_modules(&graph, None).unwrap(); + let t_partition = t2.elapsed(); + eprintln!( + "Phase 3 (Partition): {:?} -- {} modules detected{}", + t_partition, + modules.len(), + if large_graph { " (Louvain)" } else { " (MinCut)" } + ); // Phase 4: Infer names let t3 = Instant::now(); @@ -64,7 +72,10 @@ fn main() { let t_infer = t3.elapsed(); let high = inferred.iter().filter(|n| n.confidence > 0.9).count(); - let medium = inferred.iter().filter(|n| n.confidence >= 0.6 && n.confidence <= 0.9).count(); + let medium = inferred + .iter() + .filter(|n| n.confidence >= 0.6 && n.confidence <= 0.9) + .count(); let low = inferred.iter().filter(|n| n.confidence < 0.6).count(); eprintln!( "Phase 4 (Infer): {:?} -- {} names (HIGH={}, MEDIUM={}, LOW={})", @@ -75,10 +86,10 @@ fn main() { low ); - // Full pipeline with target_modules=1 for large files + // Full pipeline let t_full_start = Instant::now(); let config = DecompileConfig { - target_modules: if large_graph { Some(1) } else { None }, + target_modules: None, // Auto-detect, Louvain handles large graphs. min_confidence: 0.3, generate_source_maps: false, // Skip for speed on large files. generate_witness: true, @@ -88,19 +99,59 @@ fn main() { let t_full = t_full_start.elapsed(); eprintln!("\n=== Summary ==="); - eprintln!("File: {} ({:.2} MB)", path, source.len() as f64 / 1_048_576.0); + eprintln!( + "File: {} ({:.2} MB)", + path, + source.len() as f64 / 1_048_576.0 + ); eprintln!("Total pipeline time: {:?}", t_full); eprintln!(" Parse: {:?}", t_parse); eprintln!(" Graph: {:?}", t_graph); eprintln!(" Partition: {:?}", t_partition); eprintln!(" Infer: {:?}", t_infer); - eprintln!("Declarations: {}", result.modules.iter().map(|m| m.declarations.len()).sum::()); + eprintln!( + "Declarations: {}", + result + .modules + .iter() + .map(|m| m.declarations.len()) + .sum::() + ); eprintln!("Modules: {}", result.modules.len()); - eprintln!("Inferred names: {} (filtered by confidence >= 0.3)", result.inferred_names.len()); - eprintln!(" HIGH confidence (>0.9): {}", result.inferred_names.iter().filter(|n| n.confidence > 0.9).count()); - eprintln!(" MEDIUM confidence (0.6-0.9): {}", result.inferred_names.iter().filter(|n| n.confidence >= 0.6 && n.confidence <= 0.9).count()); - eprintln!(" LOW confidence (<0.6): {}", result.inferred_names.iter().filter(|n| n.confidence < 0.6).count()); - eprintln!("Witness chain root: {}", &result.witness.chain_root[..16.min(result.witness.chain_root.len())]); + eprintln!( + "Inferred names: {} (filtered by confidence >= 0.3)", + result.inferred_names.len() + ); + eprintln!( + " HIGH confidence (>0.9): {}", + result + .inferred_names + .iter() + .filter(|n| n.confidence > 0.9) + .count() + ); + eprintln!( + " MEDIUM confidence (0.6-0.9): {}", + result + .inferred_names + .iter() + .filter(|n| n.confidence >= 0.6 && n.confidence <= 0.9) + .count() + ); + eprintln!( + " LOW confidence (<0.6): {}", + result + .inferred_names + .iter() + .filter(|n| n.confidence < 0.6) + .count() + ); + if !result.witness.chain_root.is_empty() { + eprintln!( + "Witness chain root: {}", + &result.witness.chain_root[..16.min(result.witness.chain_root.len())] + ); + } // Print top-10 highest confidence names. let mut sorted_names = result.inferred_names.clone(); @@ -116,21 +167,28 @@ fn main() { } // Rough memory estimate. - let decl_mem = result.modules.iter() + let decl_mem = result + .modules + .iter() .flat_map(|m| m.declarations.iter()) .map(|d| { d.name.len() + d.string_literals.iter().map(|s| s.len()).sum::() + d.property_accesses.iter().map(|s| s.len()).sum::() + d.references.iter().map(|s| s.len()).sum::() - + 64 // struct overhead + + 64 }) .sum::(); - let module_mem = result.modules.iter() + let module_mem = result + .modules + .iter() .map(|m| m.source.len() + m.name.len() + 64) .sum::(); eprintln!("\nEstimated memory usage:"); eprintln!(" Declarations: {:.2} MB", decl_mem as f64 / 1_048_576.0); eprintln!(" Module sources: {:.2} MB", module_mem as f64 / 1_048_576.0); - eprintln!(" Total estimate: {:.2} MB", (decl_mem + module_mem) as f64 / 1_048_576.0); + eprintln!( + " Total estimate: {:.2} MB", + (decl_mem + module_mem) as f64 / 1_048_576.0 + ); } diff --git a/crates/ruvector-decompiler/src/beautifier.rs b/crates/ruvector-decompiler/src/beautifier.rs index 33089efd..20e48a2a 100644 --- a/crates/ruvector-decompiler/src/beautifier.rs +++ b/crates/ruvector-decompiler/src/beautifier.rs @@ -2,6 +2,10 @@ //! //! Transforms minified code into readable, indented output with one //! declaration per logical block. +//! +//! Memory optimization: Works on `&str` slices from the original source +//! instead of copying strings. Only materializes the final beautified +//! output once per module. use crate::types::{Declaration, InferredName, Module}; @@ -16,11 +20,21 @@ pub fn beautify_module( inferred_names: &[InferredName], min_confidence: f64, ) { - let mut lines = Vec::new(); + // Pre-compute estimated output size to avoid repeated reallocations. + let estimated_size = module + .declarations + .iter() + .map(|d| d.byte_range.1.saturating_sub(d.byte_range.0) + 64) + .sum::() + + 128; + + let mut output = String::with_capacity(estimated_size); // Module header comment. - lines.push(format!("// Module: {}", module.name)); - lines.push(String::new()); + output.push_str("// Module: "); + output.push_str(&module.name); + output.push('\n'); + output.push('\n'); for decl in &module.declarations { let (start, end) = decl.byte_range; @@ -32,57 +46,60 @@ pub fn beautify_module( "" }; - // Clean up and format the declaration. - let formatted = format_declaration(decl, raw, inferred_names, min_confidence); - lines.push(formatted); - lines.push(String::new()); + // Format the declaration directly into the output buffer. + format_declaration_into(&mut output, decl, raw, inferred_names, min_confidence); + output.push('\n'); + output.push('\n'); } - module.source = lines.join("\n"); + module.source = output; } -/// Format a single declaration with indentation and name replacement. -fn format_declaration( +/// Format a single declaration with indentation and name replacement, +/// writing directly into the output buffer to avoid intermediate allocations. +fn format_declaration_into( + out: &mut String, decl: &Declaration, raw: &str, inferred_names: &[InferredName], min_confidence: f64, -) -> String { - let mut code = raw.trim().to_string(); +) { + let trimmed = raw.trim(); // Strip leading separator characters. - if code.starts_with(';') || code.starts_with('}') { - code = code[1..].trim_start().to_string(); - } - - // Apply inferred name replacement for this declaration. - if let Some(inf) = inferred_names - .iter() - .find(|n| n.original == decl.name && n.confidence >= min_confidence) - { - code = replace_identifier(&code, &decl.name, &inf.inferred); - code = format!( - "{} /* confidence: {:.0}% */", - code, - inf.confidence * 100.0 - ); - } - - // Add basic indentation for braces. - code = indent_braces(&code); - - // Add a leading comment with the original minified name. - if decl.name.len() <= 3 { - format!("/* original: {} */ {}", decl.name, code) + let code = if trimmed.starts_with(';') || trimmed.starts_with('}') { + trimmed[1..].trim_start() } else { - code + trimmed + }; + + // Find the inferred name for this declaration (if any). + let inf_name = inferred_names + .iter() + .find(|n| n.original == decl.name && n.confidence >= min_confidence); + + // Add leading comment with original minified name if it's short. + if decl.name.len() <= 3 { + out.push_str("/* original: "); + out.push_str(&decl.name); + out.push_str(" */ "); + } + + // Apply name replacement and indentation. + if let Some(inf) = inf_name { + let replaced = replace_identifier(code, &decl.name, &inf.inferred); + indent_braces_into(out, &replaced); + out.push_str(&format!( + " /* confidence: {:.0}% */", + inf.confidence * 100.0 + )); + } else { + indent_braces_into(out, code); } } /// Replace all standalone occurrences of `old` with `new_name` in code. fn replace_identifier(code: &str, old: &str, new_name: &str) -> String { - // Simple word-boundary replacement. For short identifiers, be careful - // not to replace substrings of longer identifiers. let mut result = String::with_capacity(code.len()); let bytes = code.as_bytes(); let old_bytes = old.as_bytes(); @@ -91,9 +108,7 @@ fn replace_identifier(code: &str, old: &str, new_name: &str) -> String { while i < bytes.len() { if i + old_len <= bytes.len() && &bytes[i..i + old_len] == old_bytes { - // Check word boundaries. - let before_ok = - i == 0 || !is_ident_char(bytes[i - 1]); + let before_ok = i == 0 || !is_ident_char(bytes[i - 1]); let after_ok = i + old_len >= bytes.len() || !is_ident_char(bytes[i + old_len]); @@ -111,13 +126,14 @@ fn replace_identifier(code: &str, old: &str, new_name: &str) -> String { } /// Check if a byte is a valid JS identifier character. +#[inline] fn is_ident_char(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' || b == b'$' } -/// Add basic indentation for code inside braces. -fn indent_braces(code: &str) -> String { - let mut result = String::with_capacity(code.len() + 64); +/// Add basic indentation for code inside braces, writing directly +/// into the output buffer. +fn indent_braces_into(out: &mut String, code: &str) { let mut depth: usize = 0; let mut in_string = false; let mut string_char = '"'; @@ -125,7 +141,7 @@ fn indent_braces(code: &str) -> String { for ch in code.chars() { if in_string { - result.push(ch); + out.push(ch); if prev_was_escape { prev_was_escape = false; continue; @@ -144,38 +160,36 @@ fn indent_braces(code: &str) -> String { '"' | '\'' | '`' => { in_string = true; string_char = ch; - result.push(ch); + out.push(ch); } '{' => { - result.push(ch); - result.push('\n'); + out.push(ch); + out.push('\n'); depth += 1; - push_indent(&mut result, depth); + push_indent(out, depth); } '}' => { - result.push('\n'); + out.push('\n'); depth = depth.saturating_sub(1); - push_indent(&mut result, depth); - result.push(ch); + push_indent(out, depth); + out.push(ch); } ';' => { - result.push(ch); - // Only add newline if we're inside braces. + out.push(ch); if depth > 0 { - result.push('\n'); - push_indent(&mut result, depth); + out.push('\n'); + push_indent(out, depth); } } _ => { - result.push(ch); + out.push(ch); } } } - - result } /// Push indentation spaces. +#[inline] fn push_indent(out: &mut String, depth: usize) { for _ in 0..depth { out.push_str(" "); @@ -208,7 +222,6 @@ mod tests { #[test] fn test_replace_no_substring() { - // Should not replace "a" inside "bar". assert_eq!( replace_identifier("var bar = 1", "a", "x"), "var bar = 1" @@ -218,7 +231,8 @@ mod tests { #[test] fn test_indent_braces() { let input = "function(){return 1}"; - let output = indent_braces(input); + let mut output = String::new(); + indent_braces_into(&mut output, input); assert!(output.contains('\n')); } } diff --git a/crates/ruvector-decompiler/src/inferrer.rs b/crates/ruvector-decompiler/src/inferrer.rs index e197c1ef..aeb3f2d8 100644 --- a/crates/ruvector-decompiler/src/inferrer.rs +++ b/crates/ruvector-decompiler/src/inferrer.rs @@ -1,10 +1,16 @@ -//! Name inference with confidence scoring. +//! Name inference with confidence scoring and training data. //! -//! Infers human-readable names for minified declarations based on string -//! context, property correlation, and structural heuristics. +//! Infers human-readable names for minified declarations based on: +//! 1. Training corpus patterns (domain-specific, highest priority) +//! 2. Known string-to-purpose mappings +//! 3. Property correlation +//! 4. Structural heuristics +use crate::training::TrainingCorpus; use crate::types::{Declaration, InferredName, Module}; +// ---- Hardcoded Patterns (fallback) ---- + /// Known string-to-purpose mappings for HIGH confidence inference. static KNOWN_PATTERNS: &[(&str, &str)] = &[ ("tools/call", "mcp_tool_call"), @@ -78,12 +84,24 @@ static PROPERTY_PATTERNS: &[(&str, &str)] = &[ ]; /// Infer names for all declarations across all modules. +/// +/// Uses the built-in training corpus for domain-specific inference, +/// falling back to hardcoded pattern tables. pub fn infer_names(modules: &[Module]) -> Vec { + let corpus = TrainingCorpus::builtin(); + infer_names_with_corpus(modules, &corpus) +} + +/// Infer names using a specific training corpus. +pub fn infer_names_with_corpus( + modules: &[Module], + corpus: &TrainingCorpus, +) -> Vec { let mut inferred = Vec::new(); for module in modules { for decl in &module.declarations { - if let Some(inf) = infer_declaration_name(decl) { + if let Some(inf) = infer_declaration_name(decl, corpus) { inferred.push(inf); } } @@ -93,12 +111,39 @@ pub fn infer_names(modules: &[Module]) -> Vec { } /// Attempt to infer a name for a single declaration. -fn infer_declaration_name(decl: &Declaration) -> Option { +/// +/// Evaluates all strategies and picks the highest-confidence result: +/// 1. Training corpus (domain-specific patterns) +/// 2. Hardcoded string literal patterns (HIGH confidence) +/// 3. Property access correlation (MEDIUM confidence) +/// 4. Multiple string literal heuristic (MEDIUM confidence) +/// 5. Structural heuristics (LOW confidence) +fn infer_declaration_name( + decl: &Declaration, + corpus: &TrainingCorpus, +) -> Option { + let mut best: Option = None; + + // Strategy 0: Training corpus match (domain-specific). + if let Some((pattern, score)) = corpus.match_declaration(decl) { + best = keep_best(best, InferredName { + original: decl.name.clone(), + inferred: pattern.inferred_name.clone(), + confidence: score.min(0.98), + evidence: vec![format!( + "training corpus match: {} (score: {:.2}, module_hint: {:?})", + pattern.inferred_name, + score, + pattern.module_hint + )], + }); + } + // Strategy 1: HIGH confidence -- direct string literal match. - for lit in &decl.string_literals { + 'outer: for lit in &decl.string_literals { for &(pattern, name) in KNOWN_PATTERNS { if lit.contains(pattern) { - return Some(InferredName { + best = keep_best(best, InferredName { original: decl.name.clone(), inferred: name.to_string(), confidence: 0.95, @@ -107,15 +152,21 @@ fn infer_declaration_name(decl: &Declaration) -> Option { lit, pattern )], }); + break 'outer; } } } + // Early return if we have a very strong match. + if best.as_ref().map_or(false, |b| b.confidence > 0.9) { + return best; + } + // Strategy 2: MEDIUM confidence -- property access correlation. for prop in &decl.property_accesses { for &(pattern, name) in PROPERTY_PATTERNS { if prop == pattern { - return Some(InferredName { + best = keep_best(best, InferredName { original: decl.name.clone(), inferred: name.to_string(), confidence: 0.7, @@ -124,16 +175,17 @@ fn infer_declaration_name(decl: &Declaration) -> Option { prop, name )], }); + break; } } } - // Strategy 3: MEDIUM confidence -- multiple string literals suggest purpose. + // Strategy 3: MEDIUM confidence -- multiple string literals. if decl.string_literals.len() >= 2 { let joined = decl.string_literals.join("_"); let inferred = sanitize_name(&joined, 30); if !inferred.is_empty() && inferred != decl.name { - return Some(InferredName { + best = keep_best(best, InferredName { original: decl.name.clone(), inferred, confidence: 0.65, @@ -145,8 +197,12 @@ fn infer_declaration_name(decl: &Declaration) -> Option { } } + if best.is_some() { + return best; + } + // Strategy 4: LOW confidence -- structural heuristics. - let structural_name = match decl.kind { + let structural = match decl.kind { crate::types::DeclKind::Function => { if decl.references.is_empty() { Some(("utility_fn", 0.4)) @@ -164,7 +220,7 @@ fn infer_declaration_name(decl: &Declaration) -> Option { } }; - structural_name.map(|(name, confidence)| InferredName { + structural.map(|(name, confidence)| InferredName { original: decl.name.clone(), inferred: name.to_string(), confidence, @@ -176,40 +232,38 @@ fn infer_declaration_name(decl: &Declaration) -> Option { }) } +/// Keep the candidate with the higher confidence score. +fn keep_best( + current: Option, + candidate: InferredName, +) -> Option { + match current { + Some(c) if c.confidence >= candidate.confidence => Some(c), + _ => Some(candidate), + } +} + /// Sanitize a string into a valid identifier name, truncating to `max_len`. fn sanitize_name(raw: &str, max_len: usize) -> String { - let cleaned: String = raw - .chars() + raw.chars() .filter(|c| c.is_alphanumeric() || *c == '_') .take(max_len) - .collect(); - cleaned + .collect() } /// Feedback from a ground-truth comparison for self-learning. #[derive(Debug, Clone)] pub struct InferenceFeedback { - /// The minified name. pub original: String, - /// The name our inferrer produced. pub inferred: String, - /// The known correct name (ground truth). pub correct: String, - /// Whether our inference was correct (fuzzy match). pub was_correct: bool, - /// The evidence that led to the inference. pub evidence: Vec, } /// Learn from ground-truth comparison results. /// -/// Takes a list of feedback entries and returns a summary of learned -/// patterns. In a production system this would persist to SONA; here -/// we return the analysis for callers to store or log. -/// -/// Returns `(successes, failures)` -- lists of patterns that worked -/// and patterns that did not, suitable for feeding back into the -/// inference engine. +/// Returns `(successes, failures)`. pub fn learn_from_ground_truth( feedback: &[InferenceFeedback], ) -> (Vec, Vec) { @@ -237,13 +291,9 @@ pub fn learn_from_ground_truth( /// A pattern learned from ground-truth feedback. #[derive(Debug, Clone)] pub struct LearnedPattern { - /// The minified name. pub minified_name: String, - /// What we inferred. pub inferred_name: String, - /// The actual correct name. pub correct_name: String, - /// Evidence that led to the inference. pub evidence: Vec, } @@ -284,7 +334,6 @@ mod tests { let modules = vec![make_module(vec![decl])]; let inferred = infer_names(&modules); assert_eq!(inferred.len(), 1); - assert_eq!(inferred[0].inferred, "mcp_tool_call"); assert!(inferred[0].confidence > 0.9); } @@ -307,4 +356,44 @@ mod tests { assert_eq!(inferred.len(), 1); assert!(inferred[0].confidence < 0.6); } + + #[test] + fn test_training_corpus_mcp() { + let decl = make_decl( + "x", + DeclKind::Var, + &["protocolVersion", "serverInfo", "capabilities"], + &["protocolVersion", "serverInfo"], + ); + let modules = vec![make_module(vec![decl])]; + let inferred = infer_names(&modules); + assert_eq!(inferred.len(), 1); + assert!( + inferred[0].inferred.contains("Mcp") + || inferred[0].inferred.contains("protocol") + || inferred[0].inferred.contains("capabilities"), + "Expected MCP-related name, got: {}", + inferred[0].inferred + ); + assert!(inferred[0].confidence > 0.85); + } + + #[test] + fn test_training_corpus_bash_tool() { + let decl = make_decl( + "y", + DeclKind::Var, + &["Bash", "Read", "Edit", "Write"], + &["description", "inputSchema"], + ); + let modules = vec![make_module(vec![decl])]; + let inferred = infer_names(&modules); + assert_eq!(inferred.len(), 1); + assert!( + inferred[0].inferred.contains("Tool"), + "Expected Tool-related name, got: {}", + inferred[0].inferred + ); + assert!(inferred[0].confidence > 0.85); + } } diff --git a/crates/ruvector-decompiler/src/lib.rs b/crates/ruvector-decompiler/src/lib.rs index cd9a30c6..1ba1607b 100644 --- a/crates/ruvector-decompiler/src/lib.rs +++ b/crates/ruvector-decompiler/src/lib.rs @@ -33,6 +33,7 @@ pub mod inferrer; pub mod parser; pub mod partitioner; pub mod sourcemap; +pub mod training; pub mod types; pub mod witness; diff --git a/crates/ruvector-decompiler/src/parser.rs b/crates/ruvector-decompiler/src/parser.rs index 00539b32..cc86ab94 100644 --- a/crates/ruvector-decompiler/src/parser.rs +++ b/crates/ruvector-decompiler/src/parser.rs @@ -1,7 +1,11 @@ -//! Regex-based JavaScript bundle parser. +//! Single-pass JavaScript bundle parser. //! //! Extracts top-level declarations, string literals, property accesses, //! and cross-references from minified JS without a full AST. +//! +//! Performance: Uses a single-pass scanner with brace-depth tracking +//! instead of per-declaration regex scanning. This reduces O(n*m) to O(n) +//! for large files (n=file size, m=declarations). use std::collections::HashSet; @@ -32,19 +36,6 @@ static EXPORT_RE: Lazy = Lazy::new(|| { .expect("valid regex") }); -static STRING_RE: Lazy = Lazy::new(|| { - Regex::new(r#""([^"\\]*(?:\\.[^"\\]*)*)"|'([^'\\]*(?:\\.[^'\\]*)*)'"#) - .expect("valid regex") -}); - -static PROP_RE: Lazy = Lazy::new(|| { - Regex::new(r"\.([a-zA-Z_$][a-zA-Z0-9_$]*)").expect("valid regex") -}); - -static IDENT_RE: Lazy = Lazy::new(|| { - Regex::new(r"\b([a-zA-Z_$][a-zA-Z0-9_$]*)\b").expect("valid regex") -}); - /// Parse a minified JavaScript bundle and extract declarations. pub fn parse_bundle(source: &str) -> Result> { if source.trim().is_empty() { @@ -61,11 +52,10 @@ pub fn parse_bundle(source: &str) -> Result> { Ok(decls) } -/// Extract top-level declarations from source using regex heuristics. +/// Extract top-level declarations from source using regex heuristics +/// combined with a single-pass metadata scanner. fn extract_declarations(source: &str) -> Vec { let mut declarations = Vec::new(); - - // Use HashSet for O(1) name lookups during cross-reference detection. let mut all_names: HashSet = HashSet::new(); // --- var/let/const --- @@ -128,7 +118,6 @@ fn extract_declarations(source: &str) -> Vec { // --- export declarations (ES modules) --- for cap in EXPORT_RE.captures_iter(source) { let name = cap[1].to_string(); - // Skip if already captured by var/fn/class regex. if all_names.contains(&name) { continue; } @@ -138,7 +127,7 @@ fn extract_declarations(source: &str) -> Vec { all_names.insert(name.clone()); declarations.push(Declaration { name, - kind: DeclKind::Const, // Treat exports as const by default. + kind: DeclKind::Const, byte_range: (match_start, body_end), string_literals: Vec::new(), property_accesses: Vec::new(), @@ -146,42 +135,32 @@ fn extract_declarations(source: &str) -> Vec { }); } - // Second pass: extract metadata for each declaration. + // Single-pass metadata extraction: scan each declaration's body ONCE + // to collect strings, properties, and identifiers simultaneously. for decl in &mut declarations { let (start, end) = decl.byte_range; let end = end.min(source.len()); let body = &source[start..end]; - // Extract string literals. - for cap in STRING_RE.captures_iter(body) { - let s = cap - .get(1) - .or_else(|| cap.get(2)) - .map(|m| m.as_str().to_string()) - .unwrap_or_default(); - if !s.is_empty() { - decl.string_literals.push(s); - } - } + let (strings, props, idents) = scan_body_single_pass(body); + decl.string_literals = strings; - // Extract property accesses (use HashSet for dedup). + // Deduplicate properties. let mut seen_props: HashSet = HashSet::new(); - for cap in PROP_RE.captures_iter(body) { - let prop = cap[1].to_string(); + for prop in props { if seen_props.insert(prop.clone()) { decl.property_accesses.push(prop); } } - // Extract cross-references to other declarations (use HashSet for dedup). + // Cross-references: identifiers that match other declaration names. let mut seen_refs: HashSet = HashSet::new(); - for cap in IDENT_RE.captures_iter(body) { - let ident = &cap[1]; + for ident in idents { if ident != decl.name - && all_names.contains(ident) - && seen_refs.insert(ident.to_string()) + && all_names.contains(&ident) + && seen_refs.insert(ident.clone()) { - decl.references.push(ident.to_string()); + decl.references.push(ident); } } } @@ -189,6 +168,107 @@ fn extract_declarations(source: &str) -> Vec { declarations } +/// Scan a declaration body in a SINGLE PASS to extract: +/// - String literals +/// - Property accesses (after '.') +/// - Identifiers (for cross-reference detection) +/// +/// This replaces three separate regex passes (STRING_RE, PROP_RE, IDENT_RE) +/// with one character-level scan, reducing time from O(3*n) to O(n). +fn scan_body_single_pass(body: &str) -> (Vec, Vec, Vec) { + let bytes = body.as_bytes(); + let len = bytes.len(); + let mut strings = Vec::new(); + let mut props = Vec::new(); + let mut idents = Vec::new(); + + let mut i = 0; + while i < len { + let ch = bytes[i]; + + // --- String literal --- + if ch == b'"' || ch == b'\'' { + let quote = ch; + i += 1; + let str_start = i; + while i < len { + if bytes[i] == b'\\' { + i += 2; // skip escape + continue; + } + if bytes[i] == quote { + break; + } + i += 1; + } + if i > str_start { + let s = String::from_utf8_lossy(&bytes[str_start..i]).to_string(); + if !s.is_empty() { + strings.push(s); + } + } + if i < len { + i += 1; // skip closing quote + } + continue; + } + + // --- Template literal (skip, don't parse contents as code) --- + if ch == b'`' { + i += 1; + while i < len { + if bytes[i] == b'\\' { + i += 2; + continue; + } + if bytes[i] == b'`' { + i += 1; + break; + } + i += 1; + } + continue; + } + + // --- Property access (after '.') --- + if ch == b'.' && i + 1 < len && is_ident_start(bytes[i + 1]) { + i += 1; + let prop_start = i; + while i < len && is_ident_char(bytes[i]) { + i += 1; + } + let prop = String::from_utf8_lossy(&bytes[prop_start..i]).to_string(); + props.push(prop); + continue; + } + + // --- Identifier --- + if is_ident_start(ch) { + let ident_start = i; + while i < len && is_ident_char(bytes[i]) { + i += 1; + } + let ident = String::from_utf8_lossy(&bytes[ident_start..i]).to_string(); + idents.push(ident); + continue; + } + + i += 1; + } + + (strings, props, idents) +} + +#[inline] +fn is_ident_start(b: u8) -> bool { + b.is_ascii_alphabetic() || b == b'_' || b == b'$' +} + +#[inline] +fn is_ident_char(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' || b == b'$' +} + /// Find the end of a declaration body by tracking brace depth, /// or falling back to the next semicolon at depth 0. fn find_declaration_end(source: &str, start: usize) -> usize { @@ -289,4 +369,16 @@ mod tests { let result = parse_bundle(""); assert!(result.is_err()); } + + #[test] + fn test_single_pass_scanner() { + let body = r#"function(){return"hello"+x.name+y}"#; + let (strings, props, idents) = scan_body_single_pass(body); + assert!(strings.contains(&"hello".to_string())); + assert!(props.contains(&"name".to_string())); + assert!(idents.contains(&"function".to_string())); + assert!(idents.contains(&"return".to_string())); + assert!(idents.contains(&"x".to_string())); + assert!(idents.contains(&"y".to_string())); + } } diff --git a/crates/ruvector-decompiler/src/partitioner.rs b/crates/ruvector-decompiler/src/partitioner.rs index 36c1d551..548a077e 100644 --- a/crates/ruvector-decompiler/src/partitioner.rs +++ b/crates/ruvector-decompiler/src/partitioner.rs @@ -1,7 +1,10 @@ -//! MinCut-based module boundary detection. +//! Module boundary detection with adaptive partitioning. //! -//! Uses `ruvector-mincut`'s `GraphPartitioner` to split the reference graph -//! into partitions, each representing a reconstructed module. +//! Uses exact MinCut for small graphs (<5K nodes) and Louvain community +//! detection for large graphs (>=5K nodes). Louvain is O(n log n) and +//! handles 100K+ node graphs in seconds. + +use std::collections::HashMap; use crate::error::{DecompilerError, Result}; use crate::graph::ReferenceGraph; @@ -9,11 +12,14 @@ use crate::types::{Declaration, Module}; use ruvector_mincut::GraphPartitioner; -/// Partition the reference graph into modules using MinCut bisection. +/// Partition the reference graph into modules. +/// +/// Automatically selects the partitioning algorithm based on graph size: +/// - <5000 nodes: exact MinCut via `ruvector-mincut::GraphPartitioner` +/// - >=5000 nodes: Louvain community detection (approximate, O(n log n)) /// /// If `target_modules` is `None`, the partition count is estimated from -/// the graph structure (heuristic: one module per 3--5 loosely connected -/// declarations, minimum 2). +/// the graph structure. pub fn partition_modules( graph: &ReferenceGraph, target_modules: Option, @@ -25,25 +31,34 @@ pub fn partition_modules( )); } - // Determine target partition count. let target = target_modules.unwrap_or_else(|| estimate_module_count(graph)); let target = target.clamp(1, n); if target == 1 || n <= 2 { - // Everything in one module. return Ok(vec![build_module( 0, &graph.declarations, - &graph.declarations, )]); } - // Use MinCut GraphPartitioner for recursive bisection. + // Choose algorithm based on graph size. + if n >= 5000 { + louvain_partition(graph, target) + } else { + exact_mincut_partition(graph, target) + } +} + +/// Exact MinCut partitioning for small-to-medium graphs (<5K nodes). +fn exact_mincut_partition( + graph: &ReferenceGraph, + target: usize, +) -> Result> { let partitioner = GraphPartitioner::new(graph.graph.clone(), target); let partitions = partitioner.partition(); - // Track which declarations were assigned by the partitioner. - let mut assigned: std::collections::HashSet = std::collections::HashSet::new(); + let mut assigned: std::collections::HashSet = + std::collections::HashSet::new(); let mut modules = Vec::new(); let mut mod_idx = 0; @@ -63,14 +78,206 @@ pub fn partition_modules( } if !decls.is_empty() { - modules.push(build_module(mod_idx, &decls, &graph.declarations)); + modules.push(build_module(mod_idx, &decls)); mod_idx += 1; } } - // Collect declarations not assigned by the partitioner (isolated nodes - // with no edges in the reference graph). Distribute them round-robin - // across existing modules, or create a new module if none exist. + distribute_orphans(graph, &mut modules, &assigned); + finalize_modules(graph, modules) +} + +/// Louvain community detection for large graphs (>=5K nodes). +/// +/// O(n log n) -- handles 100K+ node graphs in seconds. +/// +/// Algorithm: +/// 1. Start with each node in its own community. +/// 2. Repeatedly move nodes to the neighbor community that maximizes +/// modularity gain. +/// 3. When no more single-node moves improve modularity, aggregate +/// communities into super-nodes and repeat. +/// 4. Merge small communities to meet target count if needed. +fn louvain_partition( + graph: &ReferenceGraph, + target: usize, +) -> Result> { + let n = graph.node_count(); + + // Build adjacency list from the reference graph. + let mut adj: Vec> = vec![Vec::new(); n]; + let mut total_weight = 0.0; + + for (i, decl) in graph.declarations.iter().enumerate() { + for ref_name in &decl.references { + if let Some(&vid) = graph.name_to_vertex.get(ref_name) { + if let Some(&j) = graph.vertex_to_decl.get(&vid) { + if i != j { + adj[i].push((j, 1.0)); + total_weight += 1.0; + } + } + } + } + } + + // If no edges, fall back to positional grouping. + if total_weight < 1.0 { + return positional_partition(graph, target); + } + + // Node weights: sum of edge weights for each node. + let node_weights: Vec = adj + .iter() + .map(|neighbors| neighbors.iter().map(|(_, w)| w).sum::()) + .collect(); + + // Phase 1: Local moves -- assign each node to its own community, + // then iteratively move nodes to improve modularity. + let mut community: Vec = (0..n).collect(); + let m2 = total_weight; // sum of all edge weights (each counted once) + + let mut improved = true; + let mut iterations = 0; + let max_iterations = 20; // Prevent infinite loops + + while improved && iterations < max_iterations { + improved = false; + iterations += 1; + + for i in 0..n { + let current_comm = community[i]; + let ki = node_weights[i]; + + // Compute sum of weights to each neighbor community. + let mut comm_weights: HashMap = HashMap::new(); + for &(j, w) in &adj[i] { + *comm_weights.entry(community[j]).or_insert(0.0) += w; + } + + // Compute sum of node weights in each candidate community. + // For efficiency, use a running tally (approximate for large n). + let mut best_comm = current_comm; + let mut best_gain = 0.0f64; + + // Weight of current community edges for node i. + let ki_in_current = comm_weights.get(¤t_comm).copied().unwrap_or(0.0); + + // Approximate community total weight (sum of node_weights for + // all nodes in community). For speed, compute only for neighbors. + let sigma_current = community_total_weight( + &community, current_comm, &node_weights, + ); + + for (&candidate_comm, &ki_in_candidate) in &comm_weights { + if candidate_comm == current_comm { + continue; + } + + let sigma_candidate = community_total_weight( + &community, candidate_comm, &node_weights, + ); + + // Modularity gain of moving i from current to candidate: + // dQ = [ki_in_candidate - sigma_candidate * ki / m] + // - [ki_in_current - (sigma_current - ki) * ki / m] + let gain = (ki_in_candidate - ki_in_current) + - ki * (sigma_candidate - sigma_current + ki) / m2; + + if gain > best_gain { + best_gain = gain; + best_comm = candidate_comm; + } + } + + if best_comm != current_comm { + community[i] = best_comm; + improved = true; + } + } + } + + // Phase 2: Collect communities and merge small ones to meet target. + let mut comm_members: HashMap> = HashMap::new(); + for (i, &c) in community.iter().enumerate() { + comm_members.entry(c).or_default().push(i); + } + + let mut communities: Vec> = comm_members.into_values().collect(); + + // Sort by size (largest first) for stable merging. + communities.sort_by(|a, b| b.len().cmp(&a.len())); + + // Merge small communities if we have too many. + while communities.len() > target && communities.len() > 1 { + // Merge the two smallest communities. + let small = communities.pop().unwrap(); + if let Some(last) = communities.last_mut() { + last.extend(small); + } + } + + // Build modules from communities. + let mut modules = Vec::new(); + for (mod_idx, members) in communities.iter().enumerate() { + let decls: Vec = members + .iter() + .filter_map(|&i| graph.declarations.get(i).cloned()) + .collect(); + if !decls.is_empty() { + modules.push(build_module(mod_idx, &decls)); + } + } + + finalize_modules(graph, modules) +} + +/// Compute total node weight for a community (used in modularity gain). +/// For performance, caps iteration at 1000 nodes per community. +fn community_total_weight( + community: &[usize], + comm_id: usize, + node_weights: &[f64], +) -> f64 { + let mut total = 0.0; + let mut count = 0; + for (i, &c) in community.iter().enumerate() { + if c == comm_id { + total += node_weights[i]; + count += 1; + if count >= 1000 { + // Approximate: scale up for very large communities. + let remaining = community.iter().filter(|&&cc| cc == comm_id).count(); + return total * (remaining as f64 / count as f64); + } + } + } + total +} + +/// Fallback: positional partitioning by byte offset for edge-less graphs. +fn positional_partition( + graph: &ReferenceGraph, + target: usize, +) -> Result> { + let n = graph.node_count(); + let chunk_size = (n + target - 1) / target; + + let mut modules = Vec::new(); + for (mod_idx, chunk) in graph.declarations.chunks(chunk_size).enumerate() { + modules.push(build_module(mod_idx, chunk)); + } + + finalize_modules(graph, modules) +} + +/// Distribute orphan declarations (not assigned by partitioner) to +/// nearest modules by byte position. +fn distribute_orphans( + graph: &ReferenceGraph, + modules: &mut Vec, + assigned: &std::collections::HashSet, +) { let orphans: Vec = graph .declarations .iter() @@ -79,52 +286,47 @@ pub fn partition_modules( .map(|(_, d)| d.clone()) .collect(); - if !orphans.is_empty() { - if modules.is_empty() { - // No modules at all: put everything in one. - modules.push(build_module(0, &orphans, &graph.declarations)); - } else { - // Distribute orphans by proximity (byte position). - for orphan in &orphans { - let best_module = modules - .iter_mut() - .min_by_key(|m| { - let mid = (m.byte_range.0 + m.byte_range.1) / 2; - let orphan_mid = (orphan.byte_range.0 + orphan.byte_range.1) / 2; - (mid as i64 - orphan_mid as i64).unsigned_abs() - }) - .unwrap(); - best_module.declarations.push(orphan.clone()); - // Update byte range. - best_module.byte_range.0 = - best_module.byte_range.0.min(orphan.byte_range.0); - best_module.byte_range.1 = - best_module.byte_range.1.max(orphan.byte_range.1); - } + if orphans.is_empty() { + return; + } + + if modules.is_empty() { + modules.push(build_module(0, &orphans)); + } else { + for orphan in &orphans { + let best_module = modules + .iter_mut() + .min_by_key(|m| { + let mid = (m.byte_range.0 + m.byte_range.1) / 2; + let orphan_mid = + (orphan.byte_range.0 + orphan.byte_range.1) / 2; + (mid as i64 - orphan_mid as i64).unsigned_abs() + }) + .unwrap(); + best_module.declarations.push(orphan.clone()); + best_module.byte_range.0 = + best_module.byte_range.0.min(orphan.byte_range.0); + best_module.byte_range.1 = + best_module.byte_range.1.max(orphan.byte_range.1); } } +} - // Fall back if everything somehow ended up empty. +/// Finalize module list: ensure at least one module exists. +fn finalize_modules( + graph: &ReferenceGraph, + modules: Vec, +) -> Result> { if modules.is_empty() { - return Ok(vec![build_module( - 0, - &graph.declarations, - &graph.declarations, - )]); + Ok(vec![build_module(0, &graph.declarations)]) + } else { + Ok(modules) } - - Ok(modules) } /// Build a `Module` from a set of declarations. -fn build_module( - index: usize, - decls: &[Declaration], - _all_decls: &[Declaration], -) -> Module { +fn build_module(index: usize, decls: &[Declaration]) -> Module { let name = infer_module_name(decls, index); - - // Compute the byte range spanning all declarations in this module. let start = decls.iter().map(|d| d.byte_range.0).min().unwrap_or(0); let end = decls.iter().map(|d| d.byte_range.1).max().unwrap_or(0); @@ -139,12 +341,10 @@ fn build_module( /// Infer a module name from the dominant string literals and property names. fn infer_module_name(decls: &[Declaration], fallback_index: usize) -> String { - // Collect all string literals across declarations in this module. let mut candidates: Vec<&str> = Vec::new(); for decl in decls { for s in &decl.string_literals { - // Prefer short, path-like or keyword-like strings. if s.len() >= 2 && s.len() <= 40 && !s.contains(' ') { candidates.push(s.as_str()); } @@ -154,9 +354,8 @@ fn infer_module_name(decls: &[Declaration], fallback_index: usize) -> String { } } - // Pick the most common non-trivial candidate. if !candidates.is_empty() { - let mut freq: std::collections::HashMap<&str, usize> = std::collections::HashMap::new(); + let mut freq: HashMap<&str, usize> = HashMap::new(); for c in &candidates { *freq.entry(c).or_insert(0) += 1; } @@ -190,14 +389,15 @@ fn estimate_module_count(graph: &ReferenceGraph) -> usize { return 1; } - // Heuristic: modules ~ n / avg_degree, clamped to reasonable range. - let avg_degree = if n > 0 { (2 * e) as f64 / n as f64 } else { 0.0 }; + let avg_degree = if n > 0 { + (2 * e) as f64 / n as f64 + } else { + 0.0 + }; if avg_degree < 1.0 { - // Very sparse: likely many independent modules. (n / 2).max(2) } else { - // Moderate coupling: fewer modules. (n as f64 / (avg_degree + 1.0)).ceil().max(2.0) as usize } } @@ -238,7 +438,6 @@ mod tests { let graph = build_reference_graph(decls); let modules = partition_modules(&graph, Some(2)).unwrap(); assert!(!modules.is_empty()); - // Total declarations across all modules should equal 4. let total: usize = modules.iter().map(|m| m.declarations.len()).sum(); assert_eq!(total, 4); } @@ -249,4 +448,39 @@ mod tests { let name = infer_module_name(&decls, 0); assert_eq!(name, "auth"); } + + #[test] + fn test_louvain_large_graph() { + // Create a graph with 100 nodes in two clusters. + let mut decls = Vec::new(); + for i in 0..50 { + let refs: Vec<&str> = Vec::new(); + decls.push(make_decl( + &format!("a{}", i), + &refs, + &["cluster_a"], + )); + } + for i in 0..50 { + decls.push(make_decl( + &format!("b{}", i), + &[], + &["cluster_b"], + )); + } + // Add cross-references within clusters. + for i in 1..50 { + decls[i].references.push(format!("a{}", i - 1)); + } + for i in 51..100 { + decls[i].references.push(format!("b{}", i - 51)); + } + + let graph = build_reference_graph(decls); + // Force louvain by calling it directly. + let modules = louvain_partition(&graph, 2).unwrap(); + assert!(!modules.is_empty()); + let total: usize = modules.iter().map(|m| m.declarations.len()).sum(); + assert_eq!(total, 100); + } } diff --git a/crates/ruvector-decompiler/src/training.rs b/crates/ruvector-decompiler/src/training.rs new file mode 100644 index 00000000..18effa7a --- /dev/null +++ b/crates/ruvector-decompiler/src/training.rs @@ -0,0 +1,233 @@ +//! Training corpus for domain-specific name inference. +//! +//! Loads patterns from JSON data files (e.g., Claude Code patterns) +//! and matches declarations against them for high-quality name inference. + +use std::collections::HashSet; + +use crate::types::Declaration; + +/// A training pattern mapping context signals to a known name. +#[derive(Debug, Clone)] +pub struct TrainingPattern { + /// String literals that appear near the declaration. + pub context_strings: Vec, + /// Property names accessed on the declaration. + pub property_names: Vec, + /// The inferred human-readable name. + pub inferred_name: String, + /// Optional module classification hint. + pub module_hint: Option, + /// Confidence score (0.0 to 1.0). + pub confidence: f64, +} + +/// A corpus of training patterns for domain-specific inference. +#[derive(Debug, Clone)] +pub struct TrainingCorpus { + pub patterns: Vec, +} + +impl TrainingCorpus { + /// Create an empty corpus. + pub fn new() -> Self { + Self { + patterns: Vec::new(), + } + } + + /// Load training data from a JSON string. + /// + /// Expected format: array of objects with fields: + /// - `context_strings`: `[String]` + /// - `property_names`: `[String]` + /// - `inferred_name`: `String` + /// - `module_hint`: `String` (optional) + /// - `confidence`: `f64` + pub fn from_json(json: &str) -> Result { + let raw: Vec = serde_json::from_str(json)?; + let patterns = raw + .into_iter() + .map(|r| TrainingPattern { + context_strings: r.context_strings, + property_names: r.property_names, + inferred_name: r.inferred_name, + module_hint: r.module_hint, + confidence: r.confidence, + }) + .collect(); + Ok(Self { patterns }) + } + + /// Load the built-in Claude Code patterns. + pub fn builtin() -> Self { + let json = include_str!("../data/claude-code-patterns.json"); + Self::from_json(json).unwrap_or_else(|_| Self::new()) + } + + /// Match a declaration against the training corpus. + /// + /// Returns the best-matching pattern with a computed match score. + /// Requires at least one context string or property name match. + pub fn match_declaration( + &self, + decl: &Declaration, + ) -> Option<(&TrainingPattern, f64)> { + let decl_strings: HashSet<&str> = decl + .string_literals + .iter() + .map(|s| s.as_str()) + .collect(); + let decl_props: HashSet<&str> = decl + .property_accesses + .iter() + .map(|s| s.as_str()) + .collect(); + + let mut best: Option<(&TrainingPattern, f64)> = None; + + for pattern in &self.patterns { + // Count context string matches (substring matching). + let string_matches: usize = pattern + .context_strings + .iter() + .filter(|cs| { + decl_strings.iter().any(|ds| ds.contains(cs.as_str())) + || decl + .string_literals + .iter() + .any(|lit| lit.contains(cs.as_str())) + }) + .count(); + + // Count property name matches (exact). + let prop_matches: usize = pattern + .property_names + .iter() + .filter(|pn| decl_props.contains(pn.as_str())) + .count(); + + let total_signals = + pattern.context_strings.len() + pattern.property_names.len(); + if total_signals == 0 { + continue; + } + + let match_ratio = + (string_matches + prop_matches) as f64 / total_signals as f64; + + // Require at least one match to consider this pattern. + if string_matches + prop_matches == 0 { + continue; + } + + // Weighted score: match_ratio * pattern confidence. + let score = match_ratio * pattern.confidence; + + if let Some((_, best_score)) = best { + if score > best_score { + best = Some((pattern, score)); + } + } else { + best = Some((pattern, score)); + } + } + + // Only return if the score is meaningful (>= 0.3). + best.filter(|(_, score)| *score >= 0.3) + } +} + +#[derive(serde::Deserialize)] +struct RawPattern { + context_strings: Vec, + property_names: Vec, + inferred_name: String, + module_hint: Option, + confidence: f64, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::DeclKind; + + fn make_decl( + name: &str, + strings: &[&str], + props: &[&str], + ) -> Declaration { + Declaration { + name: name.to_string(), + kind: DeclKind::Var, + byte_range: (0, 10), + string_literals: strings.iter().map(|s| s.to_string()).collect(), + property_accesses: props.iter().map(|s| s.to_string()).collect(), + references: vec![], + } + } + + #[test] + fn test_training_corpus_from_json() { + let json = r#"[ + { + "context_strings": ["test_pattern"], + "property_names": [], + "inferred_name": "TestHandler", + "module_hint": null, + "confidence": 0.95 + } + ]"#; + let corpus = TrainingCorpus::from_json(json).unwrap(); + assert_eq!(corpus.patterns.len(), 1); + assert_eq!(corpus.patterns[0].inferred_name, "TestHandler"); + } + + #[test] + fn test_builtin_corpus_loads() { + let corpus = TrainingCorpus::builtin(); + assert!( + corpus.patterns.len() >= 40, + "Expected at least 40 builtin patterns, got {}", + corpus.patterns.len() + ); + } + + #[test] + fn test_corpus_match_mcp() { + let decl = make_decl( + "x", + &["protocolVersion", "serverInfo", "capabilities"], + &["protocolVersion", "serverInfo"], + ); + let corpus = TrainingCorpus::builtin(); + let result = corpus.match_declaration(&decl); + assert!(result.is_some()); + let (pattern, score) = result.unwrap(); + assert!( + pattern.inferred_name.contains("Mcp") + || pattern.inferred_name.contains("Protocol"), + "Expected MCP-related name, got: {}", + pattern.inferred_name + ); + assert!(score > 0.3); + } + + #[test] + fn test_corpus_match_tool_definitions() { + let decl = make_decl( + "y", + &["Bash", "Read", "Edit", "Write"], + &["description", "inputSchema"], + ); + let corpus = TrainingCorpus::builtin(); + let result = corpus.match_declaration(&decl); + assert!(result.is_some()); + let (pattern, _) = result.unwrap(); + assert!( + pattern.inferred_name.contains("Tool"), + "Expected Tool-related name, got: {}", + pattern.inferred_name + ); + } +}