Search index build optimizations:
- For ASCII bigrams, store song occurrences in a Vec instead of a HashMap to save on hashing costs (~10% faster). The two characters' ASCII values are used to build the Vec indices (see the sketch below).
- For all bigrams, replace IntMaps with Vecs. This allows the same song to be counted multiple times for one bigram, but saves a huge amount of hashing cost (~30% faster).
parent cb241d21dd
commit de39b2f4a5
1 changed file with 52 additions and 31 deletions
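Before the diff itself, here is a minimal, self-contained sketch of the flat-Vec indexing idea the commit message describes. The names (AsciiBigramIndex, slot, the plain u32 song id) are illustrative assumptions for this sketch only, not the project's actual types; the real change is in the diff below.

const ASCII_RANGE: usize = u8::MAX as usize;

// One bucket per possible ASCII bigram. A song id may appear more than once in a
// bucket; that redundancy is accepted in exchange for skipping all hashing.
struct AsciiBigramIndex {
    buckets: Vec<Vec<u32>>, // u32 stands in for a song identifier
}

impl AsciiBigramIndex {
    fn new() -> Self {
        Self {
            buckets: vec![Vec::new(); ASCII_RANGE * ASCII_RANGE],
        }
    }

    // Map two ASCII characters to a unique slot in the flat Vec.
    fn slot(a: char, b: char) -> usize {
        debug_assert!(a.is_ascii() && b.is_ascii());
        (a as usize) * ASCII_RANGE + b as usize
    }

    fn insert(&mut self, text: &str, song: u32) {
        let chars: Vec<char> = text.chars().collect();
        for pair in chars.windows(2) {
            if pair.iter().all(|c| c.is_ascii()) {
                self.buckets[Self::slot(pair[0], pair[1])].push(song);
            }
            // Non-ASCII bigrams would fall back to a HashMap, as in the diff below.
        }
    }
}

fn main() {
    let mut index = AsciiBigramIndex::new();
    index.insert("hello", 1);
    index.insert("help", 2);
    // Both songs contain the bigram "he".
    println!("{:?}", index.buckets[AsciiBigramIndex::slot('h', 'e')]); // prints [1, 2]
}

The commit's real implementation additionally stores the interned string (Spur) next to each SongKey and keeps a HashMap fallback for non-ASCII bigrams, as shown in the diff.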
@@ -1,7 +1,7 @@
 use chumsky::Parser;
 use enum_map::EnumMap;
 use lasso2::Spur;
-use nohash_hasher::{IntMap, IntSet};
+use nohash_hasher::IntSet;
 use serde::{Deserialize, Serialize};
 use std::{
     cmp::Ordering,
@@ -97,9 +97,9 @@ impl Search {
         f: &Box<Expr>,
     ) -> IntSet<SongKey> {
         let is_operable = |expr: &Expr| match expr {
-            Expr::Fuzzy(Literal::Text(s)) if s.chars().count() < NGRAM_SIZE => false,
+            Expr::Fuzzy(Literal::Text(s)) if s.chars().count() < BIGRAM_SIZE => false,
             Expr::Fuzzy(Literal::Number(n)) if *n < 10 => false,
-            Expr::TextCmp(_, _, s) if s.chars().count() < NGRAM_SIZE => false,
+            Expr::TextCmp(_, _, s) if s.chars().count() < BIGRAM_SIZE => false,
             _ => true,
         };
 
@@ -162,55 +162,76 @@ impl Search {
     }
 }
 
-const NGRAM_SIZE: usize = 2;
+const BIGRAM_SIZE: usize = 2;
+const ASCII_RANGE: usize = u8::MAX as usize;
 
-#[derive(Clone, Default, Deserialize, Serialize)]
+#[derive(Clone, Deserialize, Serialize)]
 struct TextFieldIndex {
     exact: HashMap<Spur, IntSet<SongKey>>,
-    ngrams: HashMap<[char; NGRAM_SIZE], IntMap<SongKey, Spur>>,
+    ascii_bigrams: Vec<Vec<(SongKey, Spur)>>,
+    other_bigrams: HashMap<[char; BIGRAM_SIZE], Vec<(SongKey, Spur)>>,
+}
+
+impl Default for TextFieldIndex {
+    fn default() -> Self {
+        Self {
+            exact: Default::default(),
+            ascii_bigrams: vec![Default::default(); ASCII_RANGE * ASCII_RANGE],
+            other_bigrams: Default::default(),
+        }
+    }
 }
 
 impl TextFieldIndex {
-    pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {
+    fn ascii_bigram_to_index(a: char, b: char) -> usize {
+        assert!(a.is_ascii());
+        assert!(b.is_ascii());
+        (a as usize) * ASCII_RANGE + (b as usize) as usize
+    }
+
+    pub fn insert(&mut self, raw_value: &str, value: Spur, song: SongKey) {
         let characters = sanitize(raw_value).chars().collect::<TinyVec<[char; 32]>>();
-        for substring in characters[..].windows(NGRAM_SIZE) {
-            self.ngrams
-                .entry(substring.try_into().unwrap())
-                .or_default()
-                .insert(key, value);
+        for substring in characters[..].windows(BIGRAM_SIZE) {
+            if substring.iter().all(|c| c.is_ascii()) {
+                let index = Self::ascii_bigram_to_index(substring[0], substring[1]);
+                self.ascii_bigrams[index].push((song, value));
+            } else {
+                self.other_bigrams
+                    .entry(substring.try_into().unwrap())
+                    .or_default()
+                    .push((song, value));
+            }
         }
 
-        self.exact.entry(value).or_default().insert(key);
+        self.exact.entry(value).or_default().insert(song);
     }
 
     pub fn find_like(&self, dictionary: &Dictionary, value: &str) -> IntSet<SongKey> {
         let sanitized = sanitize(value);
         let characters = sanitized.chars().collect::<Vec<_>>();
-        let empty = IntMap::default();
+        let empty = Vec::new();
 
-        let mut candidates = characters[..]
-            .windows(NGRAM_SIZE)
+        let candidates_by_bigram = characters[..]
+            .windows(BIGRAM_SIZE)
             .map(|s| {
-                self.ngrams
-                    .get::<[char; NGRAM_SIZE]>(s.try_into().unwrap())
-                    .unwrap_or(&empty)
+                if s.iter().all(|c| c.is_ascii()) {
+                    let index = Self::ascii_bigram_to_index(s[0], s[1]);
+                    &self.ascii_bigrams[index]
+                } else {
+                    self.other_bigrams
+                        .get::<[char; BIGRAM_SIZE]>(s.try_into().unwrap())
+                        .unwrap_or(&empty)
+                }
             })
             .collect::<Vec<_>>();
 
-        if candidates.is_empty() {
-            return IntSet::default();
-        }
-
-        candidates.sort_by_key(|h| h.len());
-
-        candidates[0]
+        candidates_by_bigram
+            .into_iter()
+            .min_by_key(|h| h.len()) // Only check songs that contain the least common bigram from the search term
+            .unwrap_or(&empty)
             .iter()
-            // [broad phase] Only keep songs that match all bigrams from the search term
-            .filter(move |(song_key, _indexed_value)| {
-                candidates[1..].iter().all(|c| c.contains_key(&song_key))
-            })
-            // [narrow phase] Only keep songs that actually contain the search term in full
             .filter(|(_song_key, indexed_value)| {
+                // Only keep songs that actually contain the search term in full
                 let resolved = dictionary.resolve(indexed_value);
                 sanitize(resolved).contains(&sanitized)
             })