Case insensitive search

This commit is contained in:
Antoine Gersant 2024-09-22 00:22:54 -07:00
parent 409d79d8a2
commit bdc4f840a4
4 changed files with 100 additions and 49 deletions

View file

@ -259,7 +259,9 @@ impl Manager {
let index_manager = self.clone(); let index_manager = self.clone();
move || { move || {
let index = index_manager.index.read().unwrap(); let index = index_manager.index.read().unwrap();
index.search.find_songs(&index.strings, &query) index
.search
.find_songs(&index.strings, &index.canon, &query)
} }
}) })
.await .await
@ -270,6 +272,7 @@ impl Manager {
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct Index { pub struct Index {
pub strings: RodeoReader, pub strings: RodeoReader,
pub canon: HashMap<String, Spur>,
pub browser: browser::Browser, pub browser: browser::Browser,
pub collection: collection::Collection, pub collection: collection::Collection,
pub search: search::Search, pub search: search::Search,
@ -279,6 +282,7 @@ impl Default for Index {
fn default() -> Self { fn default() -> Self {
Self { Self {
strings: Rodeo::new().into_reader(), strings: Rodeo::new().into_reader(),
canon: Default::default(),
browser: Default::default(), browser: Default::default(),
collection: Default::default(), collection: Default::default(),
search: Default::default(), search: Default::default(),
@ -288,7 +292,7 @@ impl Default for Index {
pub struct Builder { pub struct Builder {
strings: Rodeo, strings: Rodeo,
minuscules: HashMap<String, Spur>, canon: HashMap<String, Spur>,
browser_builder: browser::Builder, browser_builder: browser::Builder,
collection_builder: collection::Builder, collection_builder: collection::Builder,
search_builder: search::Builder, search_builder: search::Builder,
@ -298,7 +302,7 @@ impl Builder {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
strings: Rodeo::new(), strings: Rodeo::new(),
minuscules: HashMap::default(), canon: HashMap::default(),
browser_builder: browser::Builder::default(), browser_builder: browser::Builder::default(),
collection_builder: collection::Builder::default(), collection_builder: collection::Builder::default(),
search_builder: search::Builder::default(), search_builder: search::Builder::default(),
@ -311,9 +315,7 @@ impl Builder {
} }
pub fn add_song(&mut self, scanner_song: scanner::Song) { pub fn add_song(&mut self, scanner_song: scanner::Song) {
if let Some(storage_song) = if let Some(storage_song) = store_song(&mut self.strings, &mut self.canon, &scanner_song) {
store_song(&mut self.strings, &mut self.minuscules, &scanner_song)
{
self.browser_builder self.browser_builder
.add_song(&mut self.strings, &scanner_song); .add_song(&mut self.strings, &scanner_song);
self.collection_builder.add_song(&storage_song); self.collection_builder.add_song(&storage_song);
@ -327,6 +329,7 @@ impl Builder {
collection: self.collection_builder.build(), collection: self.collection_builder.build(),
search: self.search_builder.build(), search: self.search_builder.build(),
strings: self.strings.into_reader(), strings: self.strings.into_reader(),
canon: self.canon,
} }
} }
} }

View file

@ -374,11 +374,11 @@ mod test {
fn setup_test(songs: Vec<scanner::Song>) -> (Collection, RodeoReader) { fn setup_test(songs: Vec<scanner::Song>) -> (Collection, RodeoReader) {
let mut strings = Rodeo::new(); let mut strings = Rodeo::new();
let mut minuscules = HashMap::new(); let mut canon = HashMap::new();
let mut builder = Builder::default(); let mut builder = Builder::default();
for song in songs { for song in songs {
let song = store_song(&mut strings, &mut minuscules, &song).unwrap(); let song = store_song(&mut strings, &mut canon, &song).unwrap();
builder.add_song(&song); builder.add_song(&song);
} }

View file

@ -17,7 +17,10 @@ use crate::app::{
scanner, Error, scanner, Error,
}; };
use super::{query::make_parser, storage}; use super::{
query::make_parser,
storage::{self, sanitize},
};
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct Search { pub struct Search {
@ -35,44 +38,55 @@ impl Default for Search {
} }
impl Search { impl Search {
pub fn find_songs(&self, strings: &RodeoReader, query: &str) -> Result<Vec<PathBuf>, Error> { pub fn find_songs(
&self,
strings: &RodeoReader,
canon: &HashMap<String, Spur>,
query: &str,
) -> Result<Vec<PathBuf>, Error> {
let parser = make_parser(); let parser = make_parser();
let parsed_query = parser let parsed_query = parser
.parse(query) .parse(query)
.map_err(|_| Error::SearchQueryParseError)?; .map_err(|_| Error::SearchQueryParseError)?;
let keys = self.eval(strings, &parsed_query); let keys = self.eval(strings, canon, &parsed_query);
Ok(keys Ok(keys
.into_iter() .into_iter()
.map(|k| Path::new(OsStr::new(strings.resolve(&k.virtual_path.0))).to_owned()) .map(|k| Path::new(OsStr::new(strings.resolve(&k.virtual_path.0))).to_owned())
.collect::<Vec<_>>()) .collect::<Vec<_>>())
} }
fn eval(&self, strings: &RodeoReader, expr: &Expr) -> IntSet<SongKey> { fn eval(
&self,
strings: &RodeoReader,
canon: &HashMap<String, Spur>,
expr: &Expr,
) -> IntSet<SongKey> {
match expr { match expr {
Expr::Fuzzy(s) => self.eval_fuzzy(strings, s), Expr::Fuzzy(s) => self.eval_fuzzy(strings, s),
Expr::TextCmp(field, op, s) => self.eval_text_operator(strings, *field, *op, &s), Expr::TextCmp(field, op, s) => self.eval_text_operator(canon, *field, *op, &s),
Expr::NumberCmp(field, op, n) => self.eval_number_operator(*field, *op, *n), Expr::NumberCmp(field, op, n) => self.eval_number_operator(*field, *op, *n),
Expr::Combined(e, op, f) => self.combine(strings, e, *op, f), Expr::Combined(e, op, f) => self.combine(strings, canon, e, *op, f),
} }
} }
fn combine( fn combine(
&self, &self,
strings: &RodeoReader, strings: &RodeoReader,
canon: &HashMap<String, Spur>,
e: &Box<Expr>, e: &Box<Expr>,
op: BoolOp, op: BoolOp,
f: &Box<Expr>, f: &Box<Expr>,
) -> IntSet<SongKey> { ) -> IntSet<SongKey> {
match op { match op {
BoolOp::And => self BoolOp::And => self
.eval(strings, e) .eval(strings, canon, e)
.intersection(&self.eval(strings, f)) .intersection(&self.eval(strings, canon, f))
.cloned() .cloned()
.collect(), .collect(),
BoolOp::Or => self BoolOp::Or => self
.eval(strings, e) .eval(strings, canon, e)
.union(&self.eval(strings, f)) .union(&self.eval(strings, canon, f))
.cloned() .cloned()
.collect(), .collect(),
} }
@ -102,7 +116,7 @@ impl Search {
fn eval_text_operator( fn eval_text_operator(
&self, &self,
strings: &RodeoReader, canon: &HashMap<String, Spur>,
field: TextField, field: TextField,
operator: TextOp, operator: TextOp,
value: &str, value: &str,
@ -112,7 +126,7 @@ impl Search {
}; };
match operator { match operator {
TextOp::Eq => field_index.find_exact(strings, value), TextOp::Eq => field_index.find_exact(canon, value),
TextOp::Like => field_index.find_like(value), TextOp::Like => field_index.find_like(value),
} }
} }
@ -139,7 +153,7 @@ impl TextFieldIndex {
pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) { pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {
// TODO sanitize ngrams to be case insensitive, free from diacritics and punctuation // TODO sanitize ngrams to be case insensitive, free from diacritics and punctuation
// And do the same thing to query fragments! // And do the same thing to query fragments!
let characters = raw_value.chars().collect::<TinyVec<[char; 32]>>(); let characters = sanitize(raw_value).chars().collect::<TinyVec<[char; 32]>>();
for substring in characters[..].windows(NGRAM_SIZE) { for substring in characters[..].windows(NGRAM_SIZE) {
self.ngrams self.ngrams
.entry(substring.try_into().unwrap()) .entry(substring.try_into().unwrap())
@ -151,7 +165,7 @@ impl TextFieldIndex {
} }
pub fn find_like(&self, value: &str) -> IntSet<SongKey> { pub fn find_like(&self, value: &str) -> IntSet<SongKey> {
let characters = value.chars().collect::<Vec<_>>(); let characters = sanitize(value).chars().collect::<Vec<_>>();
let empty_set = IntSet::default(); let empty_set = IntSet::default();
let mut candidates = characters[..] let mut candidates = characters[..]
@ -179,10 +193,10 @@ impl TextFieldIndex {
.collect() .collect()
} }
pub fn find_exact(&self, strings: &RodeoReader, value: &str) -> IntSet<SongKey> { pub fn find_exact(&self, canon: &HashMap<String, Spur>, value: &str) -> IntSet<SongKey> {
strings canon
.get(value) .get(&sanitize(value))
.and_then(|k| self.exact.get(&k)) .and_then(|s| self.exact.get(&s))
.cloned() .cloned()
.unwrap_or_default() .unwrap_or_default()
} }
@ -305,7 +319,7 @@ mod test {
use super::*; use super::*;
fn setup_test(songs: Vec<scanner::Song>) -> (Search, RodeoReader) { fn setup_test(songs: Vec<scanner::Song>) -> (Search, RodeoReader, HashMap<String, Spur>) {
let mut strings = Rodeo::new(); let mut strings = Rodeo::new();
let mut canon = HashMap::new(); let mut canon = HashMap::new();
@ -317,12 +331,12 @@ mod test {
let search = builder.build(); let search = builder.build();
let strings = strings.into_reader(); let strings = strings.into_reader();
(search, strings) (search, strings, canon)
} }
#[test] #[test]
fn can_find_fuzzy() { fn can_find_fuzzy() {
let (search, strings) = setup_test(vec![ let (search, strings, canon) = setup_test(vec![
scanner::Song { scanner::Song {
virtual_path: PathBuf::from("seasons.mp3"), virtual_path: PathBuf::from("seasons.mp3"),
title: Some("Seasons".to_owned()), title: Some("Seasons".to_owned()),
@ -343,7 +357,7 @@ mod test {
}, },
]); ]);
let songs = search.find_songs(&strings, "agon").unwrap(); let songs = search.find_songs(&strings, &canon, "agon").unwrap();
assert_eq!(songs.len(), 2); assert_eq!(songs.len(), 2);
assert!(songs.contains(&PathBuf::from("seasons.mp3"))); assert!(songs.contains(&PathBuf::from("seasons.mp3")));
@ -352,7 +366,7 @@ mod test {
#[test] #[test]
fn can_find_field_like() { fn can_find_field_like() {
let (search, strings) = setup_test(vec![ let (search, strings, canon) = setup_test(vec![
scanner::Song { scanner::Song {
virtual_path: PathBuf::from("seasons.mp3"), virtual_path: PathBuf::from("seasons.mp3"),
title: Some("Seasons".to_owned()), title: Some("Seasons".to_owned()),
@ -367,15 +381,36 @@ mod test {
}, },
]); ]);
let songs = search.find_songs(&strings, "artist % agon").unwrap(); let songs = search
.find_songs(&strings, &canon, "artist % agon")
.unwrap();
assert_eq!(songs.len(), 1); assert_eq!(songs.len(), 1);
assert!(songs.contains(&PathBuf::from("seasons.mp3"))); assert!(songs.contains(&PathBuf::from("seasons.mp3")));
} }
// Checks that searches match regardless of letter case: the only indexed
// song stores its artist as "Dragonforce", while both queries below are
// written entirely in lowercase.
#[test]
fn text_is_case_insensitive() {
let (search, strings, canon) = setup_test(vec![scanner::Song {
virtual_path: PathBuf::from("seasons.mp3"),
artists: vec!["Dragonforce".to_owned()],
..Default::default()
}]);
// Bare-word query (no field or operator), same form as in `can_find_fuzzy`.
let songs = search.find_songs(&strings, &canon, "dragonforce").unwrap();
assert_eq!(songs.len(), 1);
assert!(songs.contains(&PathBuf::from("seasons.mp3")));
// Field query using `=`, same form as in `can_find_field_exact`.
let songs = search
.find_songs(&strings, &canon, "artist = dragonforce")
.unwrap();
assert_eq!(songs.len(), 1);
assert!(songs.contains(&PathBuf::from("seasons.mp3")));
}
#[test] #[test]
fn can_find_field_exact() { fn can_find_field_exact() {
let (search, strings) = setup_test(vec![ let (search, strings, canon) = setup_test(vec![
scanner::Song { scanner::Song {
virtual_path: PathBuf::from("seasons.mp3"), virtual_path: PathBuf::from("seasons.mp3"),
title: Some("Seasons".to_owned()), title: Some("Seasons".to_owned()),
@ -390,17 +425,21 @@ mod test {
}, },
]); ]);
let songs = search.find_songs(&strings, "artist = Dragon").unwrap(); let songs = search
.find_songs(&strings, &canon, "artist = Dragon")
.unwrap();
assert!(songs.is_empty()); assert!(songs.is_empty());
let songs = search.find_songs(&strings, "artist = Dragonforce").unwrap(); let songs = search
.find_songs(&strings, &canon, "artist = Dragonforce")
.unwrap();
assert_eq!(songs.len(), 1); assert_eq!(songs.len(), 1);
assert!(songs.contains(&PathBuf::from("seasons.mp3"))); assert!(songs.contains(&PathBuf::from("seasons.mp3")));
} }
#[test] #[test]
fn can_use_and_operator() { fn can_use_and_operator() {
let (search, strings) = setup_test(vec![ let (search, strings, canon) = setup_test(vec![
scanner::Song { scanner::Song {
virtual_path: PathBuf::from("whale.mp3"), virtual_path: PathBuf::from("whale.mp3"),
..Default::default() ..Default::default()
@ -415,18 +454,20 @@ mod test {
}, },
]); ]);
let songs = search.find_songs(&strings, "space && whale").unwrap(); let songs = search
.find_songs(&strings, &canon, "space && whale")
.unwrap();
assert_eq!(songs.len(), 1); assert_eq!(songs.len(), 1);
assert!(songs.contains(&PathBuf::from("whales in space.mp3"))); assert!(songs.contains(&PathBuf::from("whales in space.mp3")));
let songs = search.find_songs(&strings, "space whale").unwrap(); let songs = search.find_songs(&strings, &canon, "space whale").unwrap();
assert_eq!(songs.len(), 1); assert_eq!(songs.len(), 1);
assert!(songs.contains(&PathBuf::from("whales in space.mp3"))); assert!(songs.contains(&PathBuf::from("whales in space.mp3")));
} }
#[test] #[test]
fn can_use_or_operator() { fn can_use_or_operator() {
let (search, strings) = setup_test(vec![ let (search, strings, canon) = setup_test(vec![
scanner::Song { scanner::Song {
virtual_path: PathBuf::from("whale.mp3"), virtual_path: PathBuf::from("whale.mp3"),
..Default::default() ..Default::default()
@ -441,7 +482,9 @@ mod test {
}, },
]); ]);
let songs = search.find_songs(&strings, "space || whale").unwrap(); let songs = search
.find_songs(&strings, &canon, "space || whale")
.unwrap();
assert_eq!(songs.len(), 3); assert_eq!(songs.len(), 3);
assert!(songs.contains(&PathBuf::from("whale.mp3"))); assert!(songs.contains(&PathBuf::from("whale.mp3")));
assert!(songs.contains(&PathBuf::from("space.mp3"))); assert!(songs.contains(&PathBuf::from("space.mp3")));

View file

@ -101,9 +101,19 @@ impl Song {
} }
} }
/// Builds the canonical form of `s` used when comparing text loosely.
///
/// Spaces, underscores, hyphens and apostrophes are dropped first; the
/// remaining text is then lowercased as a whole string.
pub fn sanitize(s: &str) -> String {
    // TODO merge inconsistent diacritic usage
    let kept: String = s
        .chars()
        .filter(|&c| !matches!(c, ' ' | '_' | '-' | '\''))
        .collect();
    // Lowercase the filtered string in one call (not char by char) so any
    // context-dependent lowercasing sees the same text as before.
    kept.to_lowercase()
}
pub fn store_song( pub fn store_song(
strings: &mut Rodeo, strings: &mut Rodeo,
minuscules: &mut HashMap<String, Spur>, canon: &mut HashMap<String, Spur>,
song: &scanner::Song, song: &scanner::Song,
) -> Option<Song> { ) -> Option<Song> {
let Some(real_path) = (&song.real_path).get_or_intern(strings) else { let Some(real_path) = (&song.real_path).get_or_intern(strings) else {
@ -123,17 +133,12 @@ pub fn store_song(
}; };
let mut canonicalize = |s: &String| { let mut canonicalize = |s: &String| {
let mut cleaned = s.clone(); let cleaned = sanitize(s);
cleaned.retain(|c| match c {
' ' | '_' | '-' | '\'' => false,
_ => true,
});
// TODO merge inconsistent diacritic usage
match cleaned.is_empty() { match cleaned.is_empty() {
true => None, true => None,
false => Some( false => Some(
minuscules canon
.entry(cleaned.to_lowercase()) .entry(cleaned)
.or_insert_with(|| strings.get_or_intern(s)) .or_insert_with(|| strings.get_or_intern(s))
.to_owned(), .to_owned(),
), ),