Search indexing WIP

This commit is contained in:
Antoine Gersant 2024-09-21 19:37:47 -07:00
parent e5339ab39a
commit 5e8587c39f
8 changed files with 370 additions and 76 deletions

View file

@ -106,6 +106,8 @@ pub enum Error {
AlbumNotFound,
#[error("Song not found")]
SongNotFound,
#[error("Invalid search query syntax")]
SearchQueryParseError,
#[error("Playlist not found")]
PlaylistNotFound,
#[error("No embedded artwork was found in `{0}`")]

View file

@ -21,7 +21,7 @@ mod storage;
pub use browser::File;
pub use collection::{Album, AlbumHeader, Artist, ArtistHeader, Song};
use storage::{AlbumKey, ArtistKey, InternPath, SongKey};
use storage::{store_song, AlbumKey, ArtistKey, InternPath, SongKey};
#[derive(Clone)]
pub struct Manager {
@ -254,8 +254,16 @@ impl Manager {
.unwrap()
}
pub async fn search(&self, _query: &str) -> Result<Vec<PathBuf>, Error> {
todo!();
/// Runs a search query against the current index on a blocking worker thread.
///
/// The manager is cloned so the closure owns everything it needs, the index
/// read lock is taken inside the blocking task, and the query string is handed
/// to `Search::find_songs` together with the index's string table.
///
/// Panics if the index lock is poisoned or if the blocking task fails to join.
pub async fn search(&self, query: String) -> Result<Vec<PathBuf>, Error> {
    let index_manager = self.clone();
    let task = spawn_blocking(move || {
        let index = index_manager.index.read().unwrap();
        index.search.find_songs(&index.strings, &query)
    });
    task.await.unwrap()
}
}
@ -264,6 +272,7 @@ pub struct Index {
pub strings: RodeoReader,
pub browser: browser::Browser,
pub collection: collection::Collection,
pub search: search::Search,
}
impl Default for Index {
@ -272,6 +281,7 @@ impl Default for Index {
strings: Rodeo::new().into_reader(),
browser: Default::default(),
collection: Default::default(),
search: Default::default(),
}
}
}
@ -281,6 +291,7 @@ pub struct Builder {
minuscules: HashMap<String, Spur>,
browser_builder: browser::Builder,
collection_builder: collection::Builder,
search_builder: search::Builder,
}
impl Builder {
@ -290,6 +301,7 @@ impl Builder {
minuscules: HashMap::default(),
browser_builder: browser::Builder::default(),
collection_builder: collection::Builder::default(),
search_builder: search::Builder::default(),
}
}
@ -298,16 +310,22 @@ impl Builder {
.add_directory(&mut self.strings, directory);
}
pub fn add_song(&mut self, song: scanner::Song) {
self.browser_builder.add_song(&mut self.strings, &song);
self.collection_builder
.add_song(&mut self.strings, &mut self.minuscules, &song);
/// Registers a scanned song with the browser, collection and search builders.
///
/// The song is first converted to its interned storage form via `store_song`;
/// when that conversion yields `None`, the song is not added to any builder.
pub fn add_song(&mut self, scanner_song: scanner::Song) {
    let Some(storage_song) = store_song(&mut self.strings, &mut self.minuscules, &scanner_song)
    else {
        return;
    };

    self.browser_builder
        .add_song(&mut self.strings, &scanner_song);
    self.collection_builder.add_song(&storage_song);
    self.search_builder.add_song(&scanner_song, &storage_song);
}
/// Consumes the builder and assembles the final `Index`.
///
/// Each sub-builder is finalized in turn (browser, collection, search) and the
/// string interner is converted into its read-only form last.
pub fn build(self) -> Index {
    let browser = self.browser_builder.build();
    let collection = self.collection_builder.build();
    let search = self.search_builder.build();
    let strings = self.strings.into_reader();
    Index {
        strings,
        browser,
        collection,
        search,
    }
}

View file

@ -4,14 +4,13 @@ use std::{
path::PathBuf,
};
use lasso2::{Rodeo, RodeoReader, Spur};
use lasso2::{RodeoReader, Spur};
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
use serde::{Deserialize, Serialize};
use tinyvec::TinyVec;
use unicase::UniCase;
use crate::app::index::storage::{self, store_song, AlbumKey, ArtistKey, SongKey};
use crate::app::scanner;
use crate::app::index::storage::{self, AlbumKey, ArtistKey, SongKey};
use super::storage::fetch_song;
@ -226,16 +225,7 @@ pub struct Builder {
}
impl Builder {
pub fn add_song(
&mut self,
strings: &mut Rodeo,
minuscules: &mut HashMap<String, Spur>,
song: &scanner::Song,
) {
let Some(song) = store_song(strings, minuscules, song) else {
return;
};
pub fn add_song(&mut self, song: &storage::Song) {
self.add_song_to_album(&song);
self.add_song_to_artists(&song);
@ -243,7 +233,7 @@ impl Builder {
SongKey {
virtual_path: song.virtual_path,
},
song,
song.clone(),
);
}
@ -374,9 +364,12 @@ impl Builder {
#[cfg(test)]
mod test {
use storage::InternPath;
use lasso2::Rodeo;
use tinyvec::tiny_vec;
use crate::app::scanner;
use storage::{store_song, InternPath};
use super::*;
fn setup_test(songs: Vec<scanner::Song>) -> (Collection, RodeoReader) {
@ -385,7 +378,8 @@ mod test {
let mut builder = Builder::default();
for song in songs {
builder.add_song(&mut strings, &mut minuscules, &song);
let song = store_song(&mut strings, &mut minuscules, &song).unwrap();
builder.add_song(&song);
}
let browser = builder.build();

View file

@ -6,8 +6,9 @@ use chumsky::{
text::{int, keyword, whitespace, TextParser},
Parser,
};
use serde::{Deserialize, Serialize};
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub enum TextField {
Album,
AlbumArtist,
@ -23,12 +24,10 @@ pub enum TextField {
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TextOp {
Eq,
NotEq,
Like,
NotLike,
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub enum NumberField {
DiscNumber,
TrackNumber,
@ -95,13 +94,7 @@ pub fn make_parser() -> impl Parser<char, Expr, Error = Simple<char>> {
))
.padded();
let text_op = choice((
just("=").to(TextOp::Eq),
just("!=").to(TextOp::NotEq),
just("%").to(TextOp::Like),
just("!%").to(TextOp::NotLike),
))
.padded();
let text_op = choice((just("=").to(TextOp::Eq), just("%").to(TextOp::Like))).padded();
let text_cmp = text_field
.then(text_op)
@ -248,18 +241,10 @@ fn can_parse_text_operators() {
parser.parse(r#"album = "legendary tales""#).unwrap(),
Expr::TextCmp(TextField::Album, TextOp::Eq, "legendary tales".to_owned()),
);
assert_eq!(
parser.parse(r#"album != legendary"#).unwrap(),
Expr::TextCmp(TextField::Album, TextOp::NotEq, "legendary".to_owned()),
);
assert_eq!(
parser.parse(r#"album % "legendary tales""#).unwrap(),
Expr::TextCmp(TextField::Album, TextOp::Like, "legendary tales".to_owned()),
);
assert_eq!(
parser.parse(r#"album !% "legendary""#).unwrap(),
Expr::TextCmp(TextField::Album, TextOp::NotLike, "legendary".to_owned()),
);
}
#[test]

View file

@ -1,50 +1,118 @@
use std::collections::HashSet;
use crate::app::index::{
query::{Expr, Literal, NumberField, NumberOp, TextField, TextOp},
storage::SongKey,
use chumsky::Parser;
use lasso2::{RodeoReader, Spur};
use serde::{Deserialize, Serialize};
use std::{
collections::{HashMap, HashSet},
ffi::OsStr,
path::{Path, PathBuf},
};
use super::query::BoolOp;
use crate::app::{
index::{
query::{BoolOp, Expr, Literal, NumberField, NumberOp, TextField, TextOp},
storage::SongKey,
},
scanner, Error,
};
struct SearchIndex {}
use super::{query::make_parser, storage};
impl SearchIndex {
fn eval_expr(&self, expr: &Expr) -> HashSet<SongKey> {
/// Search index over song metadata: one sub-index per searchable field.
#[derive(Serialize, Deserialize)]
pub struct Search {
// Per-field text indexes, consulted for fuzzy queries and text comparisons.
text_fields: HashMap<TextField, TextFieldIndex>,
// Per-field numeric indexes, consulted for fuzzy queries on numeric literals.
number_fields: HashMap<NumberField, NumberFieldIndex>,
}
impl Default for Search {
fn default() -> Self {
Self {
text_fields: Default::default(),
number_fields: Default::default(),
}
}
}
impl Search {
pub fn find_songs(&self, strings: &RodeoReader, query: &str) -> Result<Vec<PathBuf>, Error> {
let parser = make_parser();
let parsed_query = parser
.parse(query)
.map_err(|_| Error::SearchQueryParseError)?;
let keys = self.eval(strings, &parsed_query);
Ok(keys
.into_iter()
.map(|k| Path::new(OsStr::new(strings.resolve(&k.virtual_path.0))).to_owned())
.collect::<Vec<_>>())
}
fn eval(&self, strings: &RodeoReader, expr: &Expr) -> HashSet<SongKey> {
match expr {
Expr::Fuzzy(s) => self.eval_fuzzy(s),
Expr::TextCmp(field, op, s) => self.eval_text_operator(*field, *op, &s),
Expr::TextCmp(field, op, s) => self.eval_text_operator(strings, *field, *op, &s),
Expr::NumberCmp(field, op, n) => self.eval_number_operator(*field, *op, *n),
Expr::Combined(e, op, f) => self.combine(e, *op, f),
Expr::Combined(e, op, f) => self.combine(strings, e, *op, f),
}
}
fn combine(&self, e: &Box<Expr>, op: BoolOp, f: &Box<Expr>) -> HashSet<SongKey> {
fn combine(
&self,
strings: &RodeoReader,
e: &Box<Expr>,
op: BoolOp,
f: &Box<Expr>,
) -> HashSet<SongKey> {
match op {
BoolOp::And => self
.eval_expr(e)
.intersection(&self.eval_expr(f))
.eval(strings, e)
.intersection(&self.eval(strings, f))
.cloned()
.collect(),
BoolOp::Or => self
.eval_expr(e)
.union(&self.eval_expr(f))
.eval(strings, e)
.union(&self.eval(strings, f))
.cloned()
.collect(),
}
}
fn eval_fuzzy(&self, value: &Literal) -> HashSet<SongKey> {
HashSet::new()
match value {
Literal::Text(s) => {
let mut songs = HashSet::new();
for field in self.text_fields.values() {
songs.extend(field.find_like(s));
}
songs
}
Literal::Number(n) => {
let mut songs = HashSet::new();
for field in self.number_fields.values() {
songs.extend(field.find_equal(*n));
}
songs
.union(&self.eval_fuzzy(&Literal::Text(n.to_string())))
.copied()
.collect()
}
}
}
fn eval_text_operator(
&self,
strings: &RodeoReader,
field: TextField,
operator: TextOp,
value: &str,
) -> HashSet<SongKey> {
HashSet::new()
let Some(field_index) = self.text_fields.get(&field) else {
return HashSet::new();
};
match operator {
TextOp::Eq => field_index.find_exact(strings, value),
TextOp::Like => field_index.find_like(value),
}
}
fn eval_number_operator(
@ -53,6 +121,243 @@ impl SearchIndex {
operator: NumberOp,
value: i32,
) -> HashSet<SongKey> {
HashSet::new()
todo!()
}
}
// Number of consecutive characters in each n-gram key of `TextFieldIndex::ngrams`.
const NGRAM_SIZE: usize = 2;
/// Index of the values taken by a single text field across all songs.
#[derive(Default, Deserialize, Serialize)]
struct TextFieldIndex {
// Interned field value -> songs that were inserted with exactly that value.
exact: HashMap<Spur, HashSet<SongKey>>,
// Window of `NGRAM_SIZE` consecutive characters -> songs whose raw value contains that window.
ngrams: HashMap<[char; NGRAM_SIZE], HashSet<SongKey>>,
}
impl TextFieldIndex {
    /// Indexes one value of this field for the song identified by `key`.
    ///
    /// `raw_value` is the text of the value and `value` is its interned handle.
    /// Every window of `NGRAM_SIZE` consecutive characters of `raw_value` is
    /// recorded for substring-style lookups, and `value` is recorded for exact
    /// lookups. Values shorter than `NGRAM_SIZE` characters contribute no
    /// n-grams and are therefore only reachable through `find_exact`.
    pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {
        // TODO sanitize ngrams
        let characters = raw_value.chars().collect::<Vec<_>>();
        for substring in characters[..].windows(NGRAM_SIZE) {
            self.ngrams
                .entry(substring.try_into().unwrap())
                .or_default()
                .insert(key);
        }

        self.exact.entry(value).or_default().insert(key);
    }

    /// Returns the songs whose indexed value contains every n-gram of `value`.
    ///
    /// The match is on the set of n-grams only: the n-grams are not checked to
    /// be contiguous or in order, and characters are compared as-is (no case
    /// folding). A `value` shorter than `NGRAM_SIZE` characters has no n-grams
    /// and yields an empty set.
    pub fn find_like(&self, value: &str) -> HashSet<SongKey> {
        let characters = value.chars().collect::<Vec<_>>();

        let mut candidates = Vec::new();
        for ngram in characters.windows(NGRAM_SIZE) {
            // `windows(NGRAM_SIZE)` only yields slices of exactly NGRAM_SIZE
            // elements, so this conversion cannot fail.
            let ngram: [char; NGRAM_SIZE] = ngram.try_into().unwrap();
            // An n-gram that no indexed value contains means that nothing can
            // match the whole query; it must not be silently ignored.
            let Some(songs) = self.ngrams.get(&ngram) else {
                return HashSet::new();
            };
            candidates.push(songs);
        }

        // Walk the smallest set and test membership in the others, so the
        // intersection costs as little as possible.
        candidates.sort_by_key(|songs| songs.len());
        let Some((smallest, others)) = candidates.split_first() else {
            return HashSet::new();
        };

        smallest
            .iter()
            .filter(|key| others.iter().all(|songs| songs.contains(*key)))
            .copied()
            .collect()
    }

    /// Returns the songs that were inserted with exactly `value`.
    ///
    /// `value` is looked up in `strings` first; if it was never interned, or no
    /// song was inserted under it in this field, the result is empty.
    pub fn find_exact(&self, strings: &RodeoReader, value: &str) -> HashSet<SongKey> {
        strings
            .get(value)
            .and_then(|k| self.exact.get(&k))
            .cloned()
            .unwrap_or_default()
    }
}
/// Index of the values taken by a single numeric field across all songs.
#[derive(Default, Deserialize, Serialize)]
struct NumberFieldIndex {
// Numeric value -> songs carrying that value.
// NOTE(review): `insert` is currently an empty stub, so nothing in this impl fills this map yet.
values: HashMap<i32, HashSet<SongKey>>,
}
impl NumberFieldIndex {
    /// Placeholder for indexing a numeric value; currently does nothing.
    ///
    /// NOTE(review): the parameters mirror `TextFieldIndex::insert` and none of
    /// them is a number yet, so the signature presumably still has to change
    /// before `values` can be populated. Confirm with the author.
    pub fn insert(&mut self, raw_value: &str, value: Spur, key: SongKey) {}

    /// Returns the songs recorded under exactly `value`, or an empty set when
    /// no song is recorded under it.
    pub fn find_equal(&self, value: i32) -> HashSet<SongKey> {
        self.values.get(&value).cloned().unwrap_or_default()
    }
}
/// Accumulates per-field indexes song by song; `build` turns them into a `Search`.
#[derive(Default)]
pub struct Builder {
// Text indexes under construction, created the first time a field receives a value.
text_fields: HashMap<TextField, TextFieldIndex>,
// Numeric indexes under construction; `add_song` does not populate any of them yet.
number_fields: HashMap<NumberField, NumberFieldIndex>,
}
impl Builder {
pub fn add_song(&mut self, scanner_song: &scanner::Song, storage_song: &storage::Song) {
let song_key = SongKey {
virtual_path: storage_song.virtual_path,
};
if let (Some(str), Some(spur)) = (&scanner_song.album, storage_song.album) {
self.text_fields
.entry(TextField::Album)
.or_default()
.insert(str, spur, song_key);
}
for (str, spur) in scanner_song
.album_artists
.iter()
.zip(storage_song.album_artists.iter())
{
self.text_fields
.entry(TextField::AlbumArtist)
.or_default()
.insert(str, *spur, song_key);
}
for (str, spur) in scanner_song.artists.iter().zip(storage_song.artists.iter()) {
self.text_fields
.entry(TextField::Artist)
.or_default()
.insert(str, *spur, song_key);
}
for (str, spur) in scanner_song
.composers
.iter()
.zip(storage_song.composers.iter())
{
self.text_fields
.entry(TextField::Composer)
.or_default()
.insert(str, *spur, song_key);
}
for (str, spur) in scanner_song.genres.iter().zip(storage_song.genres.iter()) {
self.text_fields
.entry(TextField::Genre)
.or_default()
.insert(str, *spur, song_key);
}
for (str, spur) in scanner_song.labels.iter().zip(storage_song.labels.iter()) {
self.text_fields
.entry(TextField::Label)
.or_default()
.insert(str, *spur, song_key);
}
for (str, spur) in scanner_song
.lyricists
.iter()
.zip(storage_song.lyricists.iter())
{
self.text_fields
.entry(TextField::Lyricist)
.or_default()
.insert(str, *spur, song_key);
}
self.text_fields.entry(TextField::Path).or_default().insert(
scanner_song.virtual_path.to_string_lossy().as_ref(),
storage_song.virtual_path.0,
song_key,
);
if let (Some(str), Some(spur)) = (&scanner_song.title, storage_song.title) {
self.text_fields
.entry(TextField::Title)
.or_default()
.insert(str, spur, song_key);
}
}
pub fn build(self) -> Search {
Search {
text_fields: self.text_fields,
number_fields: self.number_fields,
}
}
}
#[cfg(test)]
mod test {
    use std::path::PathBuf;

    use lasso2::Rodeo;
    use storage::store_song;

    use super::*;

    /// Builds a search index (and its string table) from the given songs.
    fn setup_test(songs: Vec<scanner::Song>) -> (Search, RodeoReader) {
        let mut strings = Rodeo::new();
        let mut canon = HashMap::new();
        let mut builder = Builder::default();

        songs.iter().for_each(|song| {
            let stored = store_song(&mut strings, &mut canon, song).unwrap();
            builder.add_song(song, &stored);
        });

        (builder.build(), strings.into_reader())
    }

    /// Creates a song fixture with a path, a title and a single artist.
    fn song(path: &str, title: &str, artist: &str) -> scanner::Song {
        scanner::Song {
            virtual_path: PathBuf::from(path),
            title: Some(title.to_owned()),
            artists: vec![artist.to_owned()],
            ..Default::default()
        }
    }

    // A bare term is matched against every text field.
    #[test]
    fn can_find_fuzzy() {
        let (search, strings) = setup_test(vec![
            song("seasons.mp3", "Seasons", "Dragonforce"),
            song("potd.mp3", "Power of the Dragonflame", "Rhapsody"),
            song("calcium.mp3", "Calcium", "FSOL"),
        ]);

        let songs = search.find_songs(&strings, "agon").unwrap();

        assert_eq!(songs.len(), 2);
        for expected in ["seasons.mp3", "potd.mp3"] {
            assert!(songs.contains(&PathBuf::from(expected)));
        }
    }

    // `field % term` only looks at the named field.
    #[test]
    fn can_find_field_like() {
        let (search, strings) = setup_test(vec![
            song("seasons.mp3", "Seasons", "Dragonforce"),
            song("potd.mp3", "Power of the Dragonflame", "Rhapsody"),
        ]);

        let songs = search.find_songs(&strings, "artist % agon").unwrap();

        assert_eq!(songs.len(), 1);
        assert!(songs.contains(&PathBuf::from("seasons.mp3")));
    }
}

View file

@ -62,7 +62,6 @@ pub fn router() -> Router<App> {
.route("/random", get(get_random_albums)) // Deprecated
.route("/recent", get(get_recent_albums)) // Deprecated
// Search
.route("/search", get(get_search_root))
.route("/search/*query", get(get_search))
// Playlist management
.route("/playlists", get(get_playlists))
@ -507,26 +506,13 @@ async fn get_recent_albums(
albums_to_response(albums, api_version)
}
async fn get_search_root(
_auth: Auth,
api_version: APIMajorVersion,
State(index_manager): State<index::Manager>,
) -> Response {
let paths = match index_manager.search("").await {
Ok(f) => f,
Err(e) => return APIError::from(e).into_response(),
};
let song_list = make_song_list(paths, &index_manager).await;
song_list_to_response(song_list, api_version)
}
async fn get_search(
_auth: Auth,
api_version: APIMajorVersion,
State(index_manager): State<index::Manager>,
Path(query): Path<String>, // TODO return dto::SongList
) -> Response {
let paths = match index_manager.search(&query).await {
let paths = match index_manager.search(query).await {
Ok(f) => f,
Err(e) => return APIError::from(e).into_response(),
};

View file

@ -39,6 +39,7 @@ impl IntoResponse for APIError {
APIError::OwnAdminPrivilegeRemoval => StatusCode::CONFLICT,
APIError::PasswordHashing => StatusCode::INTERNAL_SERVER_ERROR,
APIError::PlaylistNotFound => StatusCode::NOT_FOUND,
APIError::SearchQueryParseError => StatusCode::BAD_REQUEST,
APIError::ThumbnailFlacDecoding(_, _) => StatusCode::INTERNAL_SERVER_ERROR,
APIError::ThumbnailFileIOError => StatusCode::NOT_FOUND,
APIError::ThumbnailId3Decoding(_, _) => StatusCode::INTERNAL_SERVER_ERROR,

View file

@ -65,6 +65,8 @@ pub enum APIError {
PasswordHashing,
#[error("Playlist not found")]
PlaylistNotFound,
#[error("Could not parse search query")]
SearchQueryParseError,
#[error("Could not decode thumbnail from flac file `{0}`:\n\n{1}")]
ThumbnailFlacDecoding(PathBuf, metaflac::Error),
#[error("Thumbnail file could not be opened")]
@ -137,6 +139,7 @@ impl From<app::Error> for APIError {
app::Error::AlbumNotFound => APIError::AlbumNotFound,
app::Error::SongNotFound => APIError::SongNotFound,
app::Error::PlaylistNotFound => APIError::PlaylistNotFound,
app::Error::SearchQueryParseError => APIError::SearchQueryParseError,
app::Error::EmbeddedArtworkNotFound(_) => APIError::EmbeddedArtworkNotFound,
app::Error::EmptyUsername => APIError::EmptyUsername,