Avoid false positives when all bigrams match
This commit is contained in:
parent
bdc4f840a4
commit
0fe3555560
1 changed file with 33 additions and 16 deletions
|
@@ -1,6 +1,6 @@
|
||||||
use chumsky::Parser;
|
use chumsky::Parser;
|
||||||
use lasso2::{RodeoReader, Spur};
|
use lasso2::{RodeoReader, Spur};
|
||||||
use nohash_hasher::IntSet;
|
use nohash_hasher::{IntMap, IntSet};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{
|
use std::{
|
||||||
collections::{HashMap, HashSet},
|
collections::{HashMap, HashSet},
|
||||||
|
@@ -64,7 +64,7 @@ impl Search {
|
||||||
) -> IntSet<SongKey> {
|
) -> IntSet<SongKey> {
|
||||||
match expr {
|
match expr {
|
||||||
Expr::Fuzzy(s) => self.eval_fuzzy(strings, s),
|
Expr::Fuzzy(s) => self.eval_fuzzy(strings, s),
|
||||||
Expr::TextCmp(field, op, s) => self.eval_text_operator(canon, *field, *op, &s),
|
Expr::TextCmp(field, op, s) => self.eval_text_operator(strings, canon, *field, *op, &s),
|
||||||
Expr::NumberCmp(field, op, n) => self.eval_number_operator(*field, *op, *n),
|
Expr::NumberCmp(field, op, n) => self.eval_number_operator(*field, *op, *n),
|
||||||
Expr::Combined(e, op, f) => self.combine(strings, canon, e, *op, f),
|
Expr::Combined(e, op, f) => self.combine(strings, canon, e, *op, f),
|
||||||
}
|
}
|
||||||
|
@@ -97,7 +97,7 @@ impl Search {
|
||||||
Literal::Text(s) => {
|
Literal::Text(s) => {
|
||||||
let mut songs = IntSet::default();
|
let mut songs = IntSet::default();
|
||||||
for field in self.text_fields.values() {
|
for field in self.text_fields.values() {
|
||||||
songs.extend(field.find_like(s));
|
songs.extend(field.find_like(strings, s));
|
||||||
}
|
}
|
||||||
songs
|
songs
|
||||||
}
|
}
|
||||||
|
@@ -116,6 +116,7 @@ impl Search {
|
||||||
|
|
||||||
fn eval_text_operator(
|
fn eval_text_operator(
|
||||||
&self,
|
&self,
|
||||||
|
strings: &RodeoReader,
|
||||||
canon: &HashMap<String, Spur>,
|
canon: &HashMap<String, Spur>,
|
||||||
field: TextField,
|
field: TextField,
|
||||||
operator: TextOp,
|
operator: TextOp,
|
||||||
|
@@ -127,7 +128,7 @@ impl Search {
|
||||||
|
|
||||||
match operator {
|
match operator {
|
||||||
TextOp::Eq => field_index.find_exact(canon, value),
|
TextOp::Eq => field_index.find_exact(canon, value),
|
||||||
TextOp::Like => field_index.find_like(value),
|
TextOp::Like => field_index.find_like(strings, value),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -146,34 +147,33 @@ const NGRAM_SIZE: usize = 2;
|
||||||
#[derive(Default, Deserialize, Serialize)]
|
#[derive(Default, Deserialize, Serialize)]
|
||||||
struct TextFieldIndex {
|
struct TextFieldIndex {
|
||||||
exact: HashMap<Spur, IntSet<SongKey>>,
|
exact: HashMap<Spur, IntSet<SongKey>>,
|
||||||
ngrams: HashMap<[char; NGRAM_SIZE], IntSet<SongKey>>,
|
ngrams: HashMap<[char; NGRAM_SIZE], IntMap<SongKey, Spur>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TextFieldIndex {
|
impl TextFieldIndex {
|
||||||
pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {
|
pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {
|
||||||
// TODO sanitize ngrams to be case insensitive, free from diacritics and punctuation
|
|
||||||
// And do the same thing to query fragments!
|
|
||||||
let characters = sanitize(raw_value).chars().collect::<TinyVec<[char; 32]>>();
|
let characters = sanitize(raw_value).chars().collect::<TinyVec<[char; 32]>>();
|
||||||
for substring in characters[..].windows(NGRAM_SIZE) {
|
for substring in characters[..].windows(NGRAM_SIZE) {
|
||||||
self.ngrams
|
self.ngrams
|
||||||
.entry(substring.try_into().unwrap())
|
.entry(substring.try_into().unwrap())
|
||||||
.or_default()
|
.or_default()
|
||||||
.insert(key);
|
.insert(key, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
self.exact.entry(value).or_default().insert(key);
|
self.exact.entry(value).or_default().insert(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn find_like(&self, value: &str) -> IntSet<SongKey> {
|
pub fn find_like(&self, strings: &RodeoReader, value: &str) -> IntSet<SongKey> {
|
||||||
let characters = sanitize(value).chars().collect::<Vec<_>>();
|
let sanitized = sanitize(value);
|
||||||
let empty_set = IntSet::default();
|
let characters = sanitized.chars().collect::<Vec<_>>();
|
||||||
|
let empty = IntMap::default();
|
||||||
|
|
||||||
let mut candidates = characters[..]
|
let mut candidates = characters[..]
|
||||||
.windows(NGRAM_SIZE)
|
.windows(NGRAM_SIZE)
|
||||||
.map(|s| {
|
.map(|s| {
|
||||||
self.ngrams
|
self.ngrams
|
||||||
.get::<[char; NGRAM_SIZE]>(s.try_into().unwrap())
|
.get::<[char; NGRAM_SIZE]>(s.try_into().unwrap())
|
||||||
.unwrap_or(&empty_set)
|
.unwrap_or(&empty)
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
@@ -185,10 +185,16 @@ impl TextFieldIndex {
|
||||||
|
|
||||||
candidates[0]
|
candidates[0]
|
||||||
.iter()
|
.iter()
|
||||||
.filter(move |c| candidates[1..].iter().all(|s| s.contains(c)))
|
// [broad phase] Only keep songs that match all bigrams from the search term
|
||||||
// Note: matching all the n-grams doesn't actually guarantee a substring match
|
.filter(move |(song_key, _indexed_value)| {
|
||||||
// We should theoretically resolve the underlying field value and compare with the query string
|
candidates[1..].iter().all(|c| c.contains_key(&song_key))
|
||||||
// Unlikely to cause issues for realistic use cases.
|
})
|
||||||
|
// [narrow phase] Only keep songs that actually contain the search term in full
|
||||||
|
.filter(|(_song_key, indexed_value)| {
|
||||||
|
let resolved = strings.resolve(indexed_value);
|
||||||
|
sanitize(resolved).contains(&sanitized)
|
||||||
|
})
|
||||||
|
.map(|(k, _v)| k)
|
||||||
.copied()
|
.copied()
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
@@ -490,4 +496,15 @@ mod test {
|
||||||
assert!(songs.contains(&PathBuf::from("space.mp3")));
|
assert!(songs.contains(&PathBuf::from("space.mp3")));
|
||||||
assert!(songs.contains(&PathBuf::from("whales in space.mp3")));
|
assert!(songs.contains(&PathBuf::from("whales in space.mp3")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn avoids_bigram_false_positives() {
|
||||||
|
let (search, strings, canon) = setup_test(vec![scanner::Song {
|
||||||
|
virtual_path: PathBuf::from("lorry bovine vehicle.mp3"),
|
||||||
|
..Default::default()
|
||||||
|
}]);
|
||||||
|
|
||||||
|
let songs = search.find_songs(&strings, &canon, "love").unwrap();
|
||||||
|
assert!(songs.is_empty());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue