Search index build optimizations:
- For ASCII bigrams, store song occurrences in a Vec instead of a HashMap to save on hashing costs (~10% faster). The two characters' ASCII values are used to build the Vec indices (see the sketch below).
- For all bigrams, replace IntMaps with Vecs. This allows the same song to be counted multiple times for one bigram, but saves a huge amount of hashing cost (~30% faster).
parent cb241d21dd
commit de39b2f4a5
1 changed file with 52 additions and 31 deletions
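Before the diff itself, here is a minimal, self-contained sketch of the flat-Vec indexing idea the commit message describes. The names (AsciiBigramIndex, slot, the plain u32 song id) are illustrative assumptions for this sketch only, not the project's actual types; the real change is in the diff below.

const ASCII_RANGE: usize = u8::MAX as usize;

// One bucket per possible ASCII bigram. A song id may appear more than once in a
// bucket; that redundancy is accepted in exchange for skipping all hashing.
struct AsciiBigramIndex {
    buckets: Vec<Vec<u32>>, // u32 stands in for a song identifier
}

impl AsciiBigramIndex {
    fn new() -> Self {
        Self {
            buckets: vec![Vec::new(); ASCII_RANGE * ASCII_RANGE],
        }
    }

    // Map two ASCII characters to a unique slot in the flat Vec.
    fn slot(a: char, b: char) -> usize {
        debug_assert!(a.is_ascii() && b.is_ascii());
        (a as usize) * ASCII_RANGE + b as usize
    }

    fn insert(&mut self, text: &str, song: u32) {
        let chars: Vec<char> = text.chars().collect();
        for pair in chars.windows(2) {
            if pair.iter().all(|c| c.is_ascii()) {
                self.buckets[Self::slot(pair[0], pair[1])].push(song);
            }
            // Non-ASCII bigrams would fall back to a HashMap, as in the diff below.
        }
    }
}

fn main() {
    let mut index = AsciiBigramIndex::new();
    index.insert("hello", 1);
    index.insert("help", 2);
    // Both songs contain the bigram "he".
    println!("{:?}", index.buckets[AsciiBigramIndex::slot('h', 'e')]); // prints [1, 2]
}

The commit's real implementation additionally stores the interned string (Spur) next to each SongKey and keeps a HashMap fallback for non-ASCII bigrams, as shown in the diff.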
@@ -1,7 +1,7 @@
 use chumsky::Parser;
 use enum_map::EnumMap;
 use lasso2::Spur;
-use nohash_hasher::{IntMap, IntSet};
+use nohash_hasher::IntSet;
 use serde::{Deserialize, Serialize};
 use std::{
     cmp::Ordering,
@@ -97,9 +97,9 @@ impl Search {
         f: &Box<Expr>,
     ) -> IntSet<SongKey> {
         let is_operable = |expr: &Expr| match expr {
-            Expr::Fuzzy(Literal::Text(s)) if s.chars().count() < NGRAM_SIZE => false,
+            Expr::Fuzzy(Literal::Text(s)) if s.chars().count() < BIGRAM_SIZE => false,
             Expr::Fuzzy(Literal::Number(n)) if *n < 10 => false,
-            Expr::TextCmp(_, _, s) if s.chars().count() < NGRAM_SIZE => false,
+            Expr::TextCmp(_, _, s) if s.chars().count() < BIGRAM_SIZE => false,
             _ => true,
         };
 
@@ -162,55 +162,76 @@ impl Search {
     }
 }
 
-const NGRAM_SIZE: usize = 2;
+const BIGRAM_SIZE: usize = 2;
+const ASCII_RANGE: usize = u8::MAX as usize;
 
-#[derive(Clone, Default, Deserialize, Serialize)]
+#[derive(Clone, Deserialize, Serialize)]
 struct TextFieldIndex {
     exact: HashMap<Spur, IntSet<SongKey>>,
-    ngrams: HashMap<[char; NGRAM_SIZE], IntMap<SongKey, Spur>>,
+    ascii_bigrams: Vec<Vec<(SongKey, Spur)>>,
+    other_bigrams: HashMap<[char; BIGRAM_SIZE], Vec<(SongKey, Spur)>>,
+}
+
+impl Default for TextFieldIndex {
+    fn default() -> Self {
+        Self {
+            exact: Default::default(),
+            ascii_bigrams: vec![Default::default(); ASCII_RANGE * ASCII_RANGE],
+            other_bigrams: Default::default(),
+        }
+    }
 }
 
 impl TextFieldIndex {
-    pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {
+    fn ascii_bigram_to_index(a: char, b: char) -> usize {
+        assert!(a.is_ascii());
+        assert!(b.is_ascii());
+        (a as usize) * ASCII_RANGE + (b as usize) as usize
+    }
+
+    pub fn insert(&mut self, raw_value: &str, value: Spur, song: SongKey) {
         let characters = sanitize(raw_value).chars().collect::<TinyVec<[char; 32]>>();
-        for substring in characters[..].windows(NGRAM_SIZE) {
-            self.ngrams
-                .entry(substring.try_into().unwrap())
-                .or_default()
-                .insert(key, value);
+        for substring in characters[..].windows(BIGRAM_SIZE) {
+            if substring.iter().all(|c| c.is_ascii()) {
+                let index = Self::ascii_bigram_to_index(substring[0], substring[1]);
+                self.ascii_bigrams[index].push((song, value));
+            } else {
+                self.other_bigrams
+                    .entry(substring.try_into().unwrap())
+                    .or_default()
+                    .push((song, value));
+            }
         }
 
-        self.exact.entry(value).or_default().insert(key);
+        self.exact.entry(value).or_default().insert(song);
     }
 
     pub fn find_like(&self, dictionary: &Dictionary, value: &str) -> IntSet<SongKey> {
         let sanitized = sanitize(value);
         let characters = sanitized.chars().collect::<Vec<_>>();
-        let empty = IntMap::default();
+        let empty = Vec::new();
 
-        let mut candidates = characters[..]
-            .windows(NGRAM_SIZE)
+        let candidates_by_bigram = characters[..]
+            .windows(BIGRAM_SIZE)
             .map(|s| {
-                self.ngrams
-                    .get::<[char; NGRAM_SIZE]>(s.try_into().unwrap())
-                    .unwrap_or(&empty)
+                if s.iter().all(|c| c.is_ascii()) {
+                    let index = Self::ascii_bigram_to_index(s[0], s[1]);
+                    &self.ascii_bigrams[index]
+                } else {
+                    self.other_bigrams
+                        .get::<[char; BIGRAM_SIZE]>(s.try_into().unwrap())
+                        .unwrap_or(&empty)
+                }
             })
             .collect::<Vec<_>>();
 
-        if candidates.is_empty() {
-            return IntSet::default();
-        }
-
-        candidates.sort_by_key(|h| h.len());
-
-        candidates[0]
+        candidates_by_bigram
+            .into_iter()
+            .min_by_key(|h| h.len()) // Only check songs that contain the least common bigram from the search term
+            .unwrap_or(&empty)
             .iter()
-            // [broad phase] Only keep songs that match all bigrams from the search term
-            .filter(move |(song_key, _indexed_value)| {
-                candidates[1..].iter().all(|c| c.contains_key(&song_key))
-            })
-            // [narrow phase] Only keep songs that actually contain the search term in full
             .filter(|(_song_key, indexed_value)| {
+                // Only keep songs that actually contain the search term in full
                 let resolved = dictionary.resolve(indexed_value);
                 sanitize(resolved).contains(&sanitized)
             })