diff --git a/crates/ruvllm/src/moe/affinity.rs b/crates/ruvllm/src/moe/affinity.rs index 81440766..f8d24758 100644 --- a/crates/ruvllm/src/moe/affinity.rs +++ b/crates/ruvllm/src/moe/affinity.rs @@ -238,11 +238,19 @@ impl ExpertAffinity { /// Get experts sorted by affinity score (highest first) /// - /// Useful for prefetching decisions. + /// Useful for prefetching decisions. NaN values are treated as lowest priority. pub fn top_k_by_affinity(&self, k: usize) -> Vec { - let mut indexed: Vec<(ExpertId, f32)> = - self.scores.iter().copied().enumerate().collect(); - indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + let mut indexed: Vec<(ExpertId, f32)> = self + .scores + .iter() + .enumerate() + .map(|(id, &s)| (id, if s.is_finite() { s } else { f32::NEG_INFINITY })) + .collect(); + indexed.sort_by(|a, b| { + b.1.partial_cmp(&a.1) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.0.cmp(&b.0)) // Deterministic tie-breaking by ID + }); indexed.into_iter().take(k).map(|(id, _)| id).collect() } @@ -258,7 +266,7 @@ impl ExpertAffinity { /// Get the least-affinity expert from a set of candidates /// - /// Useful for eviction decisions. + /// Useful for eviction decisions. NaN values are treated as lowest (evict first). pub fn least_affinity(&self, candidates: &[ExpertId]) -> Option { candidates .iter() @@ -266,7 +274,12 @@ impl ExpertAffinity { .min_by(|&a, &b| { let score_a = self.score(a); let score_b = self.score(b); - score_a.partial_cmp(&score_b).unwrap_or(std::cmp::Ordering::Equal) + // NaN handling: treat NaN as NEG_INFINITY for eviction priority + let sa = if score_a.is_finite() { score_a } else { f32::NEG_INFINITY }; + let sb = if score_b.is_finite() { score_b } else { f32::NEG_INFINITY }; + sa.partial_cmp(&sb) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.cmp(&b)) // Deterministic tie-breaking }) } diff --git a/crates/ruvllm/src/moe/precision_allocator.rs b/crates/ruvllm/src/moe/precision_allocator.rs index 629bec0b..d2d681a1 100644 --- a/crates/ruvllm/src/moe/precision_allocator.rs +++ b/crates/ruvllm/src/moe/precision_allocator.rs @@ -23,7 +23,7 @@ //! use ruvllm::gguf::GgufQuantType; //! //! let config = PrecisionConfig::default(); -//! let mut allocator = PrecisionAllocator::new(8, config); +//! let mut allocator = PrecisionAllocator::new(8, config).unwrap(); //! //! // Record activations as experts are used //! allocator.record_activation(2); @@ -201,7 +201,7 @@ impl PrecisionConfig { /// use ruvllm::moe::precision_allocator::{PrecisionAllocator, PrecisionConfig}; /// /// let config = PrecisionConfig::default(); -/// let mut allocator = PrecisionAllocator::new(8, config); +/// let mut allocator = PrecisionAllocator::new(8, config).unwrap(); /// /// // Simulate expert activations /// for _ in 0..100 { allocator.record_activation(0); } // Hot @@ -243,21 +243,28 @@ impl PrecisionAllocator { /// * `num_experts` - Total number of experts to track. /// * `config` - Configuration for precision allocation. /// - /// # Panics + /// # Returns /// - /// Panics if the configuration is invalid. - pub fn new(num_experts: usize, config: PrecisionConfig) -> Self { - config - .validate() - .expect("PrecisionConfig validation failed"); + /// Returns `Err` if the configuration is invalid. + pub fn new(num_experts: usize, config: PrecisionConfig) -> Result { + config.validate()?; - Self { + Ok(Self { num_experts, counts: vec![0; num_experts], config, hot_threshold: 0, cold_threshold: 0, - } + }) + } + + /// Create a new precision allocator, panicking on invalid config. + /// + /// # Panics + /// + /// Panics if the configuration is invalid. + pub fn new_unchecked(num_experts: usize, config: PrecisionConfig) -> Self { + Self::new(num_experts, config).expect("PrecisionConfig validation failed") } /// Record an activation for the given expert. @@ -513,7 +520,7 @@ mod tests { #[test] fn test_allocator_creation() { let config = PrecisionConfig::default(); - let allocator = PrecisionAllocator::new(8, config); + let allocator = PrecisionAllocator::new(8, config).unwrap(); assert_eq!(allocator.num_experts(), 8); assert_eq!(allocator.total_activations(), 0); @@ -537,7 +544,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(8, config); + let mut allocator = PrecisionAllocator::new(8, config).unwrap(); // Expert 0 gets 100 activations (max) for _ in 0..100 { @@ -569,7 +576,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(8, config); + let mut allocator = PrecisionAllocator::new(8, config).unwrap(); // Expert 0 gets 100 activations (max) for _ in 0..100 { @@ -599,7 +606,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(8, config); + let mut allocator = PrecisionAllocator::new(8, config).unwrap(); // Expert 0 gets 100 activations (max) for _ in 0..100 { @@ -629,7 +636,7 @@ mod tests { cold_percentile: 0.2, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(4, config); + let mut allocator = PrecisionAllocator::new(4, config).unwrap(); // Set up activation counts: 100, 75, 25, 5 for _ in 0..100 { @@ -677,7 +684,7 @@ mod tests { #[test] fn test_activation_recording() { let config = PrecisionConfig::default(); - let mut allocator = PrecisionAllocator::new(4, config); + let mut allocator = PrecisionAllocator::new(4, config).unwrap(); // Record individual activations allocator.record_activation(0); @@ -715,7 +722,7 @@ mod tests { warm_format: GgufQuantType::Q4_K, cold_format: GgufQuantType::Q3_K, }; - let mut allocator = PrecisionAllocator::new(3, config); + let mut allocator = PrecisionAllocator::new(3, config).unwrap(); // Set up: 100 (hot), 50 (warm), 10 (cold) for _ in 0..100 { @@ -746,7 +753,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(4, config); + let mut allocator = PrecisionAllocator::new(4, config).unwrap(); // Initially all zeros allocator.recompute_thresholds(); @@ -787,7 +794,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(4, config); + let mut allocator = PrecisionAllocator::new(4, config).unwrap(); for _ in 0..100 { allocator.record_activation(0); @@ -823,7 +830,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(8, config); + let mut allocator = PrecisionAllocator::new(8, config).unwrap(); // 2 hot, 3 warm, 3 cold for _ in 0..100 { @@ -865,7 +872,7 @@ mod tests { #[test] fn test_reset() { let config = PrecisionConfig::default(); - let mut allocator = PrecisionAllocator::new(4, config); + let mut allocator = PrecisionAllocator::new(4, config).unwrap(); // Add activations for _ in 0..100 { @@ -898,7 +905,7 @@ mod tests { cold_percentile: 0.3, ..Default::default() }; - let mut allocator = PrecisionAllocator::new(6, config); + let mut allocator = PrecisionAllocator::new(6, config).unwrap(); // Set up known distribution for _ in 0..100 { @@ -937,7 +944,7 @@ mod tests { #[test] fn test_compute_percentile() { let config = PrecisionConfig::default(); - let mut allocator = PrecisionAllocator::new(4, config); + let mut allocator = PrecisionAllocator::new(4, config).unwrap(); // No activations -> 0.0 percentile assert_eq!(allocator.compute_percentile(0), 0.0); @@ -1020,7 +1027,7 @@ mod tests { #[test] fn test_out_of_bounds_expert_id() { let config = PrecisionConfig::default(); - let allocator = PrecisionAllocator::new(4, config); + let allocator = PrecisionAllocator::new(4, config).unwrap(); // Out-of-bounds should return Cold assert_eq!(allocator.allocate(100), ExpertPrecision::Cold); @@ -1068,7 +1075,7 @@ mod tests { #[test] fn test_saturating_add_for_counts() { let config = PrecisionConfig::default(); - let mut allocator = PrecisionAllocator::new(1, config); + let mut allocator = PrecisionAllocator::new(1, config).unwrap(); // Set count close to max allocator.counts[0] = u64::MAX - 1; diff --git a/crates/ruvllm/src/moe/router.rs b/crates/ruvllm/src/moe/router.rs index 1aaba4b7..557aac0c 100644 --- a/crates/ruvllm/src/moe/router.rs +++ b/crates/ruvllm/src/moe/router.rs @@ -216,22 +216,26 @@ impl MemoryAwareRouter { /// * `config` - Router configuration /// * `affinity` - Expert affinity tracker (can be shared) /// - /// # Panics + /// # Returns /// - /// Panics if the configuration is invalid. - pub fn new(config: RouterConfig, affinity: ExpertAffinity) -> Self { - config.validate().expect("RouterConfig validation failed"); + /// Returns `Err` if the configuration is invalid. + pub fn new(config: RouterConfig, affinity: ExpertAffinity) -> Result { + config.validate()?; - Self { + Ok(Self { cache_resident: vec![false; config.num_experts], config, affinity, metrics: MoeMetrics::new(), - } + }) } /// Create router with default affinity tracker - pub fn with_default_affinity(config: RouterConfig) -> Self { + /// + /// # Returns + /// + /// Returns `Err` if the configuration is invalid. + pub fn with_default_affinity(config: RouterConfig) -> Result { let affinity = ExpertAffinity::new( super::AffinityConfig::with_num_experts(config.num_experts) ); @@ -294,45 +298,88 @@ impl MemoryAwareRouter { (selected, paging_requests) } - /// Apply cache residency bonus to scores + /// Apply cache residency bonus to scores (in-place mutation for P0 optimization) + /// + /// For each expert currently in cache, adds `cache_bonus` to its score. + /// This biases the selection toward cached experts without completely + /// overriding the gate network's decisions. + /// + /// # Arguments + /// + /// * `scores` - Mutable slice of scores to modify in-place + pub fn apply_cache_bonus_inplace(&self, scores: &mut [f32]) { + for (id, score) in scores.iter_mut().enumerate() { + // Validate score is not NaN/Inf before processing + if !score.is_finite() { + *score = 0.0; + continue; + } + if self.cache_resident.get(id).copied().unwrap_or(false) { + *score += self.config.cache_bonus; + } + } + } + + /// Apply cache residency bonus to scores (allocating version for API compatibility) /// /// For each expert currently in cache, adds `cache_bonus` to its score. /// This biases the selection toward cached experts without completely /// overriding the gate network's decisions. pub fn apply_cache_bonus(&self, scores: &[f32]) -> Vec { - scores - .iter() - .enumerate() - .map(|(id, &score)| { - let bonus = if self.cache_resident.get(id).copied().unwrap_or(false) { - self.config.cache_bonus - } else { - 0.0 - }; - score + bonus - }) - .collect() + let mut result = scores.to_vec(); + self.apply_cache_bonus_inplace(&mut result); + result } /// Select top-K experts by score /// /// Returns expert IDs sorted by descending score. /// Ties are broken by expert ID (lower ID wins) for determinism. + /// + /// Uses partial sort (P0 optimization) for better performance when + /// top_k << num_experts. pub fn select_top_k(&self, scores: &[f32]) -> Vec { - // Create indexed scores - let mut indexed: Vec<(ExpertId, f32)> = scores.iter().copied().enumerate().collect(); + let n = scores.len(); + let k = self.config.top_k.min(n); - // Sort by score descending, then by ID ascending (for determinism) - indexed.sort_by(|a, b| { - b.1.partial_cmp(&a.1) - .unwrap_or(std::cmp::Ordering::Equal) - .then_with(|| a.0.cmp(&b.0)) - }); + if k == 0 || n == 0 { + return Vec::new(); + } + + // Create indexed scores, handling NaN/Inf values + let mut indexed: Vec<(ExpertId, f32)> = scores + .iter() + .enumerate() + .map(|(id, &s)| (id, if s.is_finite() { s } else { f32::NEG_INFINITY })) + .collect(); + + // Use partial sort for better performance when k << n + if k < n / 2 { + // Partition to get top-k elements (unordered) + indexed.select_nth_unstable_by(k - 1, |a, b| { + b.1.partial_cmp(&a.1) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.0.cmp(&b.0)) + }); + // Sort only the top-k portion + indexed[..k].sort_by(|a, b| { + b.1.partial_cmp(&a.1) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.0.cmp(&b.0)) + }); + } else { + // Full sort when k is close to n + indexed.sort_by(|a, b| { + b.1.partial_cmp(&a.1) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.0.cmp(&b.0)) + }); + } // Take top-K indexed .into_iter() - .take(self.config.top_k) + .take(k) .map(|(id, _)| id) .collect() } @@ -457,7 +504,7 @@ mod tests { fn make_router(num_experts: usize, top_k: usize, cache_bonus: f32) -> MemoryAwareRouter { let config = RouterConfig::new(num_experts, top_k).with_cache_bonus(cache_bonus); - MemoryAwareRouter::with_default_affinity(config) + MemoryAwareRouter::with_default_affinity(config).expect("test config should be valid") } // --------------------------------------------------------------- @@ -688,7 +735,7 @@ mod tests { #[test] fn test_memory_aware_disabled() { let config = RouterConfig::new(4, 2).with_memory_aware(false).with_cache_bonus(0.5); - let mut router = MemoryAwareRouter::with_default_affinity(config); + let mut router = MemoryAwareRouter::with_default_affinity(config).unwrap(); // Even with high cache bonus, should not apply it when disabled router.update_cache_state(&[3]); // Expert 3 resident @@ -734,7 +781,7 @@ mod tests { let config = RouterConfig::new(4, 2).with_cache_bonus(0.0); let affinity_config = AffinityConfig::with_num_experts(4).with_decay(1.0); let affinity = ExpertAffinity::new(affinity_config); - let mut router = MemoryAwareRouter::new(config, affinity); + let mut router = MemoryAwareRouter::new(config, affinity).unwrap(); // Build affinity let gate_logits = vec![0.4, 0.3, 0.5, 0.2]; diff --git a/crates/ruvllm/src/moe/sram_mapper.rs b/crates/ruvllm/src/moe/sram_mapper.rs index 9a00cbcc..678969fb 100644 --- a/crates/ruvllm/src/moe/sram_mapper.rs +++ b/crates/ruvllm/src/moe/sram_mapper.rs @@ -156,7 +156,14 @@ impl SramExpertAffinity { // - Recency is important for temporal locality // - Router weight indicates model preference let freq_factor = (self.access_count as f32 + 1.0).ln(); - let recency_factor = 1.0 / (1.0 + (self.last_access as f32).recip() * 0.001); + + // Guard against division by zero when last_access is 0 + let recency_factor = if self.last_access == 0 { + 0.0 + } else { + 1.0 / (1.0 + 0.001 / self.last_access as f32) + }; + let weight_factor = self.avg_router_weight * 2.0; freq_factor + recency_factor + weight_factor @@ -466,11 +473,13 @@ impl SramMapper { /// * `expert_id` - Expert to assign /// * `tier` - Target memory tier /// - /// # Panics + /// # Returns /// - /// Panics if `expert_id >= num_experts`. - pub fn assign_tier(&mut self, expert_id: ExpertId, tier: MemoryTier) { - assert!(expert_id < self.num_experts, "Expert ID out of range"); + /// Returns `false` if `expert_id >= num_experts`, `true` otherwise. + pub fn assign_tier(&mut self, expert_id: ExpertId, tier: MemoryTier) -> bool { + if expert_id >= self.num_experts { + return false; + } let old_tier = self.tier_map[expert_id]; @@ -497,6 +506,7 @@ impl SramMapper { } self.tier_map[expert_id] = tier; + true } /// Get the current memory tier for an expert.