mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-24 22:15:18 +00:00
feat(core): add DeterministicScore type with deterministic fusion
Port fixed-point i64 scoring (2^32 scale) from khive-score into ruvector-core. Fixes NaN panics in FlatIndex, changes RRF default K from 60 to 15 (per Lean4 consensus bias proof), and adds search_deterministic() to VectorIndex trait. Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
5126ba418f
commit
63d3ff6535
6 changed files with 653 additions and 6 deletions
|
|
@ -266,7 +266,8 @@ pub struct ScoredDoc {
|
|||
/// Strategy for combining ranked lists from different retrieval systems.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
|
||||
pub enum FusionStrategy {
|
||||
/// Reciprocal Rank Fusion. `k` controls rank-pressure (default 60).
|
||||
/// Reciprocal Rank Fusion. `k` controls rank-pressure (default 15).
|
||||
/// K=15 per Lean4 proof `ret010_k60_consensus_bias` — K=60 has consensus bias.
|
||||
RRF { k: f32 },
|
||||
/// Weighted linear combination of normalised scores.
|
||||
Linear {
|
||||
|
|
@ -280,7 +281,7 @@ pub enum FusionStrategy {
|
|||
|
||||
impl Default for FusionStrategy {
|
||||
fn default() -> Self {
|
||||
FusionStrategy::RRF { k: 60.0 }
|
||||
FusionStrategy::RRF { k: 15.0 }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
612
crates/ruvector-core/src/deterministic_score.rs
Normal file
612
crates/ruvector-core/src/deterministic_score.rs
Normal file
|
|
@ -0,0 +1,612 @@
|
|||
//! Fixed-point integer scoring for cross-platform deterministic ranking.
|
||||
//!
|
||||
//! Converts f64 to i64 with 2^32 scaling. NaN → ZERO, +Inf → MAX, -Inf → NEG_INF.
|
||||
//! All arithmetic uses i128 intermediates with saturating semantics.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::ops::{Add, Div, Mul, Sub};
|
||||
|
||||
use crate::types::{DistanceMetric, SearchResult, VectorId};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DeterministicScore
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
|
||||
#[repr(transparent)]
|
||||
pub struct DeterministicScore(i64);
|
||||
|
||||
impl DeterministicScore {
|
||||
const SCALE: f64 = 4_294_967_296.0; // 2^32
|
||||
|
||||
pub const MAX: Self = Self(i64::MAX);
|
||||
pub const NEG_INF: Self = Self(i64::MIN + 1);
|
||||
pub const ZERO: Self = Self(0);
|
||||
|
||||
#[inline]
|
||||
pub const fn from_raw(raw: i64) -> Self {
|
||||
Self(raw)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub const fn to_raw(self) -> i64 {
|
||||
self.0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn from_f64(val: f64) -> Self {
|
||||
if val.is_nan() {
|
||||
return Self::ZERO;
|
||||
}
|
||||
if val.is_infinite() {
|
||||
return if val.is_sign_positive() {
|
||||
Self::MAX
|
||||
} else {
|
||||
Self::NEG_INF
|
||||
};
|
||||
}
|
||||
let scaled = (val * Self::SCALE).round();
|
||||
Self::from_rounded_arithmetic(scaled)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn from_f32(val: f32) -> Self {
|
||||
Self::from_f64(val as f64)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn to_f64(self) -> f64 {
|
||||
if self.0 == Self::MAX.0 {
|
||||
return f64::INFINITY;
|
||||
}
|
||||
if self.0 == Self::NEG_INF.0 {
|
||||
return f64::NEG_INFINITY;
|
||||
}
|
||||
self.0 as f64 / Self::SCALE
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub const fn is_infinite(self) -> bool {
|
||||
self.0 == i64::MAX || self.0 == Self::NEG_INF.0
|
||||
}
|
||||
|
||||
/// Convert an f32 distance (lower = closer) to a similarity DeterministicScore
|
||||
/// (higher = better), taking the distance metric into account.
|
||||
#[inline]
|
||||
pub fn similarity_from_distance(distance: f32, metric: DistanceMetric) -> Self {
|
||||
let similarity = match metric {
|
||||
DistanceMetric::Cosine => 1.0 - distance as f64,
|
||||
DistanceMetric::DotProduct => -(distance as f64),
|
||||
DistanceMetric::Euclidean | DistanceMetric::Manhattan => {
|
||||
1.0 / (1.0 + distance as f64)
|
||||
}
|
||||
};
|
||||
Self::from_f64(similarity)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn from_arithmetic_raw(raw: i128) -> Self {
|
||||
if raw >= i64::MAX as i128 {
|
||||
Self::MAX
|
||||
} else if raw <= Self::NEG_INF.0 as i128 {
|
||||
Self::NEG_INF
|
||||
} else {
|
||||
Self(raw as i64)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn from_rounded_arithmetic(raw: f64) -> Self {
|
||||
if raw.is_nan() {
|
||||
Self::ZERO
|
||||
} else if raw.is_sign_positive() && !raw.is_finite() {
|
||||
Self::MAX
|
||||
} else if !raw.is_finite() {
|
||||
Self::NEG_INF
|
||||
} else if raw >= i64::MAX as f64 {
|
||||
Self::MAX
|
||||
} else if raw <= i64::MIN as f64 {
|
||||
Self::NEG_INF
|
||||
} else {
|
||||
Self(raw as i64)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for DeterministicScore {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.0.cmp(&other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for DeterministicScore {
|
||||
#[inline]
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Hash for DeterministicScore {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.0.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DeterministicScore {
|
||||
fn default() -> Self {
|
||||
Self::ZERO
|
||||
}
|
||||
}
|
||||
|
||||
impl Add for DeterministicScore {
|
||||
type Output = Self;
|
||||
#[inline]
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self::from_arithmetic_raw(self.0 as i128 + rhs.0 as i128)
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub for DeterministicScore {
|
||||
type Output = Self;
|
||||
#[inline]
|
||||
fn sub(self, rhs: Self) -> Self::Output {
|
||||
Self::from_arithmetic_raw(self.0 as i128 - rhs.0 as i128)
|
||||
}
|
||||
}
|
||||
|
||||
impl Mul<i64> for DeterministicScore {
|
||||
type Output = Self;
|
||||
#[inline]
|
||||
fn mul(self, rhs: i64) -> Self::Output {
|
||||
let result = (self.0 as i128).saturating_mul(rhs as i128);
|
||||
Self::from_arithmetic_raw(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl Mul<f64> for DeterministicScore {
|
||||
type Output = Self;
|
||||
#[inline]
|
||||
fn mul(self, rhs: f64) -> Self::Output {
|
||||
if rhs.is_nan() {
|
||||
return Self::ZERO;
|
||||
}
|
||||
let product = (self.0 as f64) * rhs;
|
||||
Self::from_rounded_arithmetic(product.round())
|
||||
}
|
||||
}
|
||||
|
||||
impl Div<i64> for DeterministicScore {
|
||||
type Output = Self;
|
||||
#[inline]
|
||||
fn div(self, rhs: i64) -> Self::Output {
|
||||
if rhs == 0 {
|
||||
return if self.0 == 0 {
|
||||
Self::ZERO
|
||||
} else if self.0 > 0 {
|
||||
Self::MAX
|
||||
} else {
|
||||
Self::NEG_INF
|
||||
};
|
||||
}
|
||||
Self::from_arithmetic_raw(self.0.saturating_div(rhs) as i128)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for DeterministicScore {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if *self == Self::MAX {
|
||||
write!(f, "DeterministicScore(+Inf)")
|
||||
} else if *self == Self::NEG_INF {
|
||||
write!(f, "DeterministicScore(-Inf)")
|
||||
} else {
|
||||
write!(f, "DeterministicScore({:.9})", self.to_f64())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeterministicScore {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if *self == Self::MAX {
|
||||
write!(f, "+Inf")
|
||||
} else if *self == Self::NEG_INF {
|
||||
write!(f, "-Inf")
|
||||
} else {
|
||||
write!(f, "{:.6}", self.to_f64())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<f64> for DeterministicScore {
|
||||
fn from(val: f64) -> Self {
|
||||
Self::from_f64(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<f32> for DeterministicScore {
|
||||
fn from(val: f32) -> Self {
|
||||
Self::from_f32(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DeterministicScore> for f64 {
|
||||
fn from(score: DeterministicScore) -> Self {
|
||||
score.to_f64()
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DeterministicSearchResult
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Search result with deterministic fixed-point scoring.
|
||||
///
|
||||
/// Parallel to `SearchResult` but uses `DeterministicScore` instead of f32.
|
||||
/// Score semantics: higher = more relevant (similarity, not distance).
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeterministicSearchResult {
|
||||
pub id: VectorId,
|
||||
pub score: DeterministicScore,
|
||||
pub vector: Option<Vec<f32>>,
|
||||
pub metadata: Option<std::collections::HashMap<String, serde_json::Value>>,
|
||||
}
|
||||
|
||||
impl SearchResult {
|
||||
/// Convert a distance-based SearchResult into a deterministic similarity result.
|
||||
pub fn to_deterministic(&self, metric: DistanceMetric) -> DeterministicSearchResult {
|
||||
DeterministicSearchResult {
|
||||
id: self.id.clone(),
|
||||
score: DeterministicScore::similarity_from_distance(self.score, metric),
|
||||
vector: self.vector.clone(),
|
||||
metadata: self.metadata.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Deterministic Fusion
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Deterministic Reciprocal Rank Fusion over DeterministicScore.
|
||||
///
|
||||
/// Uses BTreeMap (not HashMap) for iteration-order determinism.
|
||||
/// Accumulates in i128 to prevent overflow. Ties broken by id (ascending).
|
||||
/// Default k=15 per Lean4 proof `ret010_k60_consensus_bias`.
|
||||
pub fn deterministic_rrf(
|
||||
sources: &[Vec<DeterministicSearchResult>],
|
||||
k_param: usize,
|
||||
top_k: usize,
|
||||
) -> Vec<DeterministicSearchResult> {
|
||||
let mut totals: BTreeMap<&VectorId, i128> = BTreeMap::new();
|
||||
|
||||
for source in sources {
|
||||
for (rank_0, result) in source.iter().enumerate() {
|
||||
let rank_1 = rank_0 + 1;
|
||||
let contrib = DeterministicScore::from_f64(1.0 / (k_param + rank_1) as f64);
|
||||
*totals.entry(&result.id).or_default() += contrib.to_raw() as i128;
|
||||
}
|
||||
}
|
||||
|
||||
let mut results: Vec<DeterministicSearchResult> = totals
|
||||
.into_iter()
|
||||
.map(|(id, raw)| {
|
||||
let clamped = raw.clamp(DeterministicScore::NEG_INF.0 as i128, i64::MAX as i128);
|
||||
DeterministicSearchResult {
|
||||
id: id.clone(),
|
||||
score: DeterministicScore::from_raw(clamped as i64),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
results.sort_by(|a, b| b.score.cmp(&a.score).then_with(|| a.id.cmp(&b.id)));
|
||||
results.truncate(top_k);
|
||||
results
|
||||
}
|
||||
|
||||
/// Deterministic weighted linear combination.
|
||||
///
|
||||
/// Each source gets a weight. Scores are accumulated in i128.
|
||||
pub fn deterministic_weighted(
|
||||
sources: &[(f64, Vec<DeterministicSearchResult>)],
|
||||
top_k: usize,
|
||||
) -> Vec<DeterministicSearchResult> {
|
||||
let mut totals: BTreeMap<&VectorId, i128> = BTreeMap::new();
|
||||
|
||||
for (weight, results) in sources {
|
||||
for result in results {
|
||||
let weighted = result.score * *weight;
|
||||
*totals.entry(&result.id).or_default() += weighted.to_raw() as i128;
|
||||
}
|
||||
}
|
||||
|
||||
let mut results: Vec<DeterministicSearchResult> = totals
|
||||
.into_iter()
|
||||
.map(|(id, raw)| {
|
||||
let clamped = raw.clamp(DeterministicScore::NEG_INF.0 as i128, i64::MAX as i128);
|
||||
DeterministicSearchResult {
|
||||
id: id.clone(),
|
||||
score: DeterministicScore::from_raw(clamped as i64),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
results.sort_by(|a, b| b.score.cmp(&a.score).then_with(|| a.id.cmp(&b.id)));
|
||||
results.truncate(top_k);
|
||||
results
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// -- Core type ------------------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn roundtrip_f64() {
|
||||
let s = DeterministicScore::from_f64(0.5);
|
||||
assert!((s.to_f64() - 0.5).abs() < 1e-9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn nan_maps_to_zero() {
|
||||
let s = DeterministicScore::from_f64(f64::NAN);
|
||||
assert_eq!(s, DeterministicScore::ZERO);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn infinity() {
|
||||
assert_eq!(
|
||||
DeterministicScore::from_f64(f64::INFINITY),
|
||||
DeterministicScore::MAX
|
||||
);
|
||||
assert_eq!(
|
||||
DeterministicScore::from_f64(f64::NEG_INFINITY),
|
||||
DeterministicScore::NEG_INF
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn total_ordering() {
|
||||
let a = DeterministicScore::from_f64(0.1);
|
||||
let b = DeterministicScore::from_f64(0.5);
|
||||
let c = DeterministicScore::from_f64(f64::NAN);
|
||||
assert!(a < b);
|
||||
assert_eq!(c, DeterministicScore::ZERO);
|
||||
// NaN-as-zero sorts deterministically
|
||||
assert!(c < b);
|
||||
assert!(c < a || c == a || c > a); // total order, no panic
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn saturating_add() {
|
||||
assert_eq!(
|
||||
DeterministicScore::MAX + DeterministicScore::from_raw(1),
|
||||
DeterministicScore::MAX
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn saturating_sub() {
|
||||
assert_eq!(
|
||||
DeterministicScore::NEG_INF - DeterministicScore::from_raw(1),
|
||||
DeterministicScore::NEG_INF
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn arithmetic_add() {
|
||||
let a = DeterministicScore::from_f64(0.3);
|
||||
let b = DeterministicScore::from_f64(0.4);
|
||||
assert!((a + b).to_f64() - 0.7 < 1e-9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn div_by_zero() {
|
||||
assert_eq!(
|
||||
DeterministicScore::from_f64(0.5) / 0,
|
||||
DeterministicScore::MAX
|
||||
);
|
||||
assert_eq!(
|
||||
DeterministicScore::from_f64(-0.5) / 0,
|
||||
DeterministicScore::NEG_INF
|
||||
);
|
||||
assert_eq!(DeterministicScore::ZERO / 0, DeterministicScore::ZERO);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn raw_scale() {
|
||||
assert_eq!(
|
||||
DeterministicScore::from_f64(1.0).to_raw(),
|
||||
4_294_967_296_i64
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn display_format() {
|
||||
assert_eq!(format!("{}", DeterministicScore::MAX), "+Inf");
|
||||
assert_eq!(format!("{}", DeterministicScore::NEG_INF), "-Inf");
|
||||
let s = format!("{}", DeterministicScore::from_f64(0.1234567));
|
||||
assert_eq!(s, "0.123457");
|
||||
}
|
||||
|
||||
// -- Boundary conversion --------------------------------------------------
|
||||
|
||||
// f32→f64 cast introduces ~1e-7 error, so tolerance must be > f32 epsilon
|
||||
const F32_TOL: f64 = 1e-7;
|
||||
|
||||
#[test]
|
||||
fn cosine_distance_to_similarity() {
|
||||
// cosine distance 0.2 → similarity 0.8
|
||||
let s = DeterministicScore::similarity_from_distance(0.2, DistanceMetric::Cosine);
|
||||
assert!((s.to_f64() - 0.8).abs() < F32_TOL);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn euclidean_distance_to_similarity() {
|
||||
// euclidean distance 0.0 → similarity 1.0
|
||||
let s = DeterministicScore::similarity_from_distance(0.0, DistanceMetric::Euclidean);
|
||||
assert!((s.to_f64() - 1.0).abs() < F32_TOL);
|
||||
// euclidean distance 1.0 → similarity 0.5
|
||||
let s = DeterministicScore::similarity_from_distance(1.0, DistanceMetric::Euclidean);
|
||||
assert!((s.to_f64() - 0.5).abs() < F32_TOL);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dot_product_distance_to_similarity() {
|
||||
// dot_product_distance stores -dot, so distance = -5.0 means dot = 5.0
|
||||
let s = DeterministicScore::similarity_from_distance(-5.0, DistanceMetric::DotProduct);
|
||||
assert!((s.to_f64() - 5.0).abs() < F32_TOL);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_result_to_deterministic() {
|
||||
let result = SearchResult {
|
||||
id: "doc1".to_string(),
|
||||
score: 0.1, // cosine distance
|
||||
vector: None,
|
||||
metadata: None,
|
||||
};
|
||||
let det = result.to_deterministic(DistanceMetric::Cosine);
|
||||
assert_eq!(det.id, "doc1");
|
||||
assert!((det.score.to_f64() - 0.9).abs() < F32_TOL);
|
||||
}
|
||||
|
||||
// -- f32→i64 order preservation (Lean4: score_003_order_preservation) -----
|
||||
|
||||
#[test]
|
||||
fn f32_to_i64_order_preservation() {
|
||||
let distances: Vec<f32> = vec![0.01, 0.05, 0.1, 0.2, 0.5, 0.9, 1.5];
|
||||
let scores: Vec<DeterministicScore> = distances
|
||||
.iter()
|
||||
.map(|&d| DeterministicScore::similarity_from_distance(d, DistanceMetric::Cosine))
|
||||
.collect();
|
||||
// Cosine: smaller distance → higher similarity → higher score
|
||||
for i in 0..scores.len() - 1 {
|
||||
assert!(
|
||||
scores[i] > scores[i + 1],
|
||||
"Order violation at index {}: {:?} should be > {:?}",
|
||||
i,
|
||||
scores[i],
|
||||
scores[i + 1]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// -- Deterministic RRF ----------------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn rrf_basic() {
|
||||
let source_a = vec![
|
||||
DeterministicSearchResult {
|
||||
id: "a".into(),
|
||||
score: DeterministicScore::from_f64(0.9),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
},
|
||||
DeterministicSearchResult {
|
||||
id: "b".into(),
|
||||
score: DeterministicScore::from_f64(0.8),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
},
|
||||
];
|
||||
let source_b = vec![
|
||||
DeterministicSearchResult {
|
||||
id: "b".into(),
|
||||
score: DeterministicScore::from_f64(0.95),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
},
|
||||
DeterministicSearchResult {
|
||||
id: "c".into(),
|
||||
score: DeterministicScore::from_f64(0.7),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
},
|
||||
];
|
||||
let results = deterministic_rrf(&[source_a, source_b], 15, 10);
|
||||
// "b" appears at rank 1 in both → highest RRF
|
||||
assert_eq!(results[0].id, "b");
|
||||
assert_eq!(results.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rrf_commutative() {
|
||||
// Source order should not affect results (Lean4: rrf_sum_comm)
|
||||
let s1 = vec![DeterministicSearchResult {
|
||||
id: "x".into(),
|
||||
score: DeterministicScore::from_f64(0.9),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}];
|
||||
let s2 = vec![DeterministicSearchResult {
|
||||
id: "y".into(),
|
||||
score: DeterministicScore::from_f64(0.8),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}];
|
||||
|
||||
let forward = deterministic_rrf(&[s1.clone(), s2.clone()], 15, 10);
|
||||
let reverse = deterministic_rrf(&[s2, s1], 15, 10);
|
||||
|
||||
assert_eq!(forward.len(), reverse.len());
|
||||
for (a, b) in forward.iter().zip(reverse.iter()) {
|
||||
assert_eq!(a.id, b.id);
|
||||
assert_eq!(a.score, b.score);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rrf_tie_broken_by_id() {
|
||||
let s1 = vec![DeterministicSearchResult {
|
||||
id: "beta".into(),
|
||||
score: DeterministicScore::from_f64(0.5),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}];
|
||||
let s2 = vec![DeterministicSearchResult {
|
||||
id: "alpha".into(),
|
||||
score: DeterministicScore::from_f64(0.5),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}];
|
||||
let results = deterministic_rrf(&[s1, s2], 15, 10);
|
||||
// Same RRF score (both rank 1 in one source), tie broken by id ascending
|
||||
assert_eq!(results[0].id, "alpha");
|
||||
assert_eq!(results[1].id, "beta");
|
||||
}
|
||||
|
||||
// -- Deterministic weighted -----------------------------------------------
|
||||
|
||||
#[test]
|
||||
fn weighted_basic() {
|
||||
let dense = vec![DeterministicSearchResult {
|
||||
id: "a".into(),
|
||||
score: DeterministicScore::from_f64(0.8),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}];
|
||||
let sparse = vec![DeterministicSearchResult {
|
||||
id: "a".into(),
|
||||
score: DeterministicScore::from_f64(0.6),
|
||||
vector: None,
|
||||
metadata: None,
|
||||
}];
|
||||
let results = deterministic_weighted(&[(0.7, dense), (0.3, sparse)], 10);
|
||||
assert_eq!(results.len(), 1);
|
||||
// 0.7 * 0.8 + 0.3 * 0.6 = 0.56 + 0.18 = 0.74
|
||||
assert!((results[0].score.to_f64() - 0.74).abs() < 1e-6);
|
||||
}
|
||||
}
|
||||
|
|
@ -4,8 +4,9 @@ pub mod flat;
|
|||
#[cfg(feature = "hnsw")]
|
||||
pub mod hnsw;
|
||||
|
||||
use crate::deterministic_score::DeterministicSearchResult;
|
||||
use crate::error::Result;
|
||||
use crate::types::{SearchResult, VectorId};
|
||||
use crate::types::{DistanceMetric, SearchResult, VectorId};
|
||||
|
||||
/// Trait for vector index implementations
|
||||
pub trait VectorIndex: Send + Sync {
|
||||
|
|
@ -20,9 +21,26 @@ pub trait VectorIndex: Send + Sync {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Search for k nearest neighbors
|
||||
/// Search for k nearest neighbors (returns f32 distances)
|
||||
fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>>;
|
||||
|
||||
/// Search with deterministic scoring.
|
||||
///
|
||||
/// Converts f32 distances from the kernel into DeterministicScore similarities
|
||||
/// at the boundary. Everything above this point uses total-ordered i64 scores.
|
||||
fn search_deterministic(
|
||||
&self,
|
||||
query: &[f32],
|
||||
k: usize,
|
||||
) -> Result<Vec<DeterministicSearchResult>> {
|
||||
let metric = self.metric();
|
||||
let results = self.search(query, k)?;
|
||||
Ok(results.iter().map(|r| r.to_deterministic(metric)).collect())
|
||||
}
|
||||
|
||||
/// The distance metric used by this index.
|
||||
fn metric(&self) -> DistanceMetric;
|
||||
|
||||
/// Remove a vector from the index
|
||||
fn remove(&mut self, id: &VectorId) -> Result<bool>;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
//! Flat (brute-force) index for baseline and small datasets
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::distance::distance;
|
||||
use crate::error::Result;
|
||||
use crate::index::VectorIndex;
|
||||
|
|
@ -60,8 +62,10 @@ impl VectorIndex for FlatIndex {
|
|||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
// Sort by distance and take top k
|
||||
results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
||||
// Sort by distance — NaN-safe total ordering (NaN sorts last)
|
||||
results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or_else(|| {
|
||||
if a.1.is_nan() { Ordering::Greater } else { Ordering::Less }
|
||||
}));
|
||||
results.truncate(k);
|
||||
|
||||
Ok(results
|
||||
|
|
@ -75,6 +79,10 @@ impl VectorIndex for FlatIndex {
|
|||
.collect())
|
||||
}
|
||||
|
||||
fn metric(&self) -> DistanceMetric {
|
||||
self.metric
|
||||
}
|
||||
|
||||
fn remove(&mut self, id: &VectorId) -> Result<bool> {
|
||||
Ok(self.vectors.remove(id).is_some())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -333,6 +333,10 @@ impl VectorIndex for HnswIndex {
|
|||
self.search_with_ef(query, k, self.config.ef_search)
|
||||
}
|
||||
|
||||
fn metric(&self) -> DistanceMetric {
|
||||
self.metric
|
||||
}
|
||||
|
||||
fn remove(&mut self, id: &VectorId) -> Result<bool> {
|
||||
let inner = self.inner.write();
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@
|
|||
#![allow(clippy::incompatible_msrv)]
|
||||
|
||||
pub mod advanced_features;
|
||||
pub mod deterministic_score;
|
||||
|
||||
// AgenticDB requires storage feature
|
||||
#[cfg(feature = "storage")]
|
||||
|
|
@ -108,6 +109,9 @@ const _: () = {
|
|||
let _ = AGENTICDB_EMBEDDING_WARNING;
|
||||
};
|
||||
|
||||
pub use deterministic_score::{
|
||||
deterministic_rrf, deterministic_weighted, DeterministicScore, DeterministicSearchResult,
|
||||
};
|
||||
pub use error::{Result, RuvectorError};
|
||||
pub use types::{DistanceMetric, SearchQuery, SearchResult, VectorEntry, VectorId};
|
||||
pub use vector_db::VectorDB;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue