	Speed-up the MatchingWords highlighting struct
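This change reworks the MatchingWords struct used for highlighting so that it no longer pre-computes every matchable word. Previously, from_query_tree expanded each query term against the words FST through word_derivations and stored the results in a BTreeMap<String, IsPrefix>; now the query tree is walked once to collect (word, allowed typos, is_prefix) triples and one Levenshtein DFA is built per triple, so a candidate token is checked by evaluating those DFAs directly. SearchResult exposes the resulting MatchingWords in place of the old found_words: HashSet<String>, and the DFA construction that word_derivations used to do inline is factored out into a reusable build_dfa helper. Short illustrative sketches follow the relevant hunks below.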
@@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
pub use self::update_store::UpdateStore;

pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

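The hunk above only re-exports the new struct from the crate root so the code that renders highlights can reach it. A hypothetical consumer (the milli crate name and the should_highlight function are assumptions for illustration, not part of this patch) would use it roughly like this:

    use milli::{MatchingWords, SearchResult};

    // Decide whether a token of a displayed field should be highlighted.
    fn should_highlight(result: &SearchResult, token: &str) -> bool {
        // `matching_words` replaces the old `found_words: HashSet<String>` field.
        result.matching_words.matches(token)
    }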
@@ -1,8 +1,8 @@
use std::{borrow::Cow, collections::HashMap, mem::take};

use anyhow::bail;
use roaring::RoaringBitmap;
use log::debug;
use roaring::RoaringBitmap;

use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::word_derivations;

@@ -1,10 +1,9 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::fmt;
use std::time::Instant;

use fst::{IntoStreamer, Streamer, Set};
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
use log::debug;
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
use once_cell::sync::Lazy;
@@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult};
use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity};
use crate::{Index, DocumentId};

pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::facet::FacetIter;
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::query_tree::MatchingWords;
use self::query_tree::QueryTreeBuilder;

// Building these factories is not free.
@@ -87,6 +87,11 @@ impl<'a> Search<'a> {

        debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());

        let matching_words = match query_tree.as_ref() {
            Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
            None => MatchingWords::default(),
        };

        // We are testing the typo criteria but there will be more of them soon.
        let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?;
        let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?;
@@ -128,8 +133,7 @@ impl<'a> Search<'a> {
            if limit == 0 { break }
        }

        let found_words = HashSet::new();
        Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids })
        Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
    }
}

@@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> {

#[derive(Default)]
pub struct SearchResult {
    pub found_words: HashSet<String>,
    pub matching_words: MatchingWords,
    pub candidates: RoaringBitmap,
    // TODO those documents ids should be associated with their criteria scores.
    pub documents_ids: Vec<DocumentId>,
}

pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> {
    let lev = match max_typo {
        0 => &LEVDIST0,
        1 => &LEVDIST1,
        _ => &LEVDIST2,
    };

    let dfa = if is_prefix {
        lev.build_prefix_dfa(&word)
    } else {
        lev.build_dfa(&word)
    };

pub fn word_derivations(
    word: &str,
    is_prefix: bool,
    max_typo: u8,
    fst: &fst::Set<Cow<[u8]>>,
) -> anyhow::Result<Vec<(String, u8)>>
{
    let mut derived_words = Vec::new();
    let dfa = build_dfa(word, max_typo, is_prefix);
    let mut stream = fst.search_with_state(&dfa).into_stream();

    while let Some((word, state)) = stream.next() {
@@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se

    Ok(derived_words)
}

pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
    let lev = match typos {
        0 => &LEVDIST0,
        1 => &LEVDIST1,
        _ => &LEVDIST2,
    };

    if is_prefix {
        lev.build_prefix_dfa(word)
    } else {
        lev.build_dfa(word)
    }
}

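For orientation, here is a minimal standalone sketch of the pattern captured by build_dfa: the LevenshteinAutomatonBuilder instances are the expensive part, so they are created once (mirroring the LEVDIST0/1/2 factories referenced above) and reused to produce one cheap DFA per query word, and DFA::eval then classifies any candidate string by edit distance. The transposition flag and the example words are assumptions made for this sketch.

    use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder as LevBuilder, DFA};
    use once_cell::sync::Lazy;

    // Building these factories is not free, so build them once and reuse them.
    // The `true` transposition flag is an assumption for this sketch.
    static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
    static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
    static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));

    // Same shape as the `build_dfa` helper introduced in the hunk above.
    fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
        let lev = match typos {
            0 => &LEVDIST0,
            1 => &LEVDIST1,
            _ => &LEVDIST2,
        };
        if is_prefix {
            lev.build_prefix_dfa(word)
        } else {
            lev.build_dfa(word)
        }
    }

    fn main() {
        // "wordl" is one transposition away from "world", so a 1-typo DFA accepts it.
        let dfa = build_dfa("world", 1, false);
        match dfa.eval("wordl") {
            Distance::Exact(typos) => assert!(typos <= 1),
            Distance::AtLeast(_) => panic!("rejected: more than one typo away"),
        }
    }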
@@ -1,12 +1,13 @@
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::{fmt, cmp, mem};

use levenshtein_automata::{DFA, Distance};
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;

use crate::Index;
use super::build_dfa;

type IsOptionalWord = bool;
type IsPrefix = bool;
@@ -113,6 +114,14 @@ impl QueryKind {
        QueryKind::Tolerant { typo, word }
    }

    pub fn is_tolerant(&self) -> bool {
        matches!(self, QueryKind::Tolerant { .. })
    }

    pub fn is_exact(&self) -> bool {
        matches!(self, QueryKind::Exact { .. })
    }

    pub fn typo(&self) -> u8 {
        match self {
            QueryKind::Tolerant { typo, .. } => *typo,
@@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
}

/// The query tree builder is the interface to build a query tree.
#[derive(Default)]
pub struct MatchingWords {
    inner: BTreeMap<String, IsPrefix>
    dfas: Vec<(DFA, u8)>,
}

impl MatchingWords {
    /// List all words which can be considered as a match for the query tree.
    pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self {
        Self { inner: fetch_words(tree, fst).into_iter().collect() }
    pub fn from_query_tree(tree: &Operation) -> Self {
        Self {
            dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect()
        }
    }

    /// Return true if the word match.
    pub fn is_match(&self, word: &str) -> bool {
        fn first_char(s: &str) -> Option<&str> {
            s.chars().next().map(|c| &s[..c.len_utf8()])
        }

        match first_char(word) {
            Some(first) => {
                let left = first.to_owned();
                let right = word.to_owned();
                self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
            },
            None => false
        }
    pub fn matches(&self, word: &str) -> bool {
        self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) {
            Distance::Exact(t) => t <= *typo,
            Distance::AtLeast(_) => false,
        })
    }
}

type FetchedWords = Vec<(String, IsPrefix)>;

/// Lists all words which can be considered as a match for the query tree.
fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
    fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
        tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
    }

    fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
        match query.kind.clone() {
            QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
            QueryKind::Tolerant { typo, word } => {
                if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) {
                    words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
                } else {
                    vec![(word, query.prefix)]
                }
            }
        }
    }

    fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
    fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
        match tree {
            Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
                resolve_branch(ops.as_slice(), fst)
                ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
            },
            Operation::Query(ops) => {
                resolve_query(ops, fst)
            Operation::Query(Query { prefix, kind }) => {
                let typo = if kind.is_exact() { 0 } else { kind.typo() };
                out.insert((kind.word(), typo, *prefix));
            },
        }
    }

    let mut words = resolve_ops(tree, fst);
    words.sort_unstable();
    words.dedup();
    words
    let mut queries = HashSet::new();
    resolve_ops(tree, &mut queries);
    queries
}

/// Main function that creates the final query tree from the primitive query.
@@ -559,7 +544,7 @@ mod test {
    use std::collections::HashMap;

    use fst::Set;
    use maplit::hashmap;
    use maplit::{hashmap, hashset};
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
    use rand::{Rng, SeedableRng, rngs::StdRng};

@@ -970,26 +955,26 @@ mod test {
        let context = TestContext::default();
        let query_tree = context.build(false, true, tokens).unwrap().unwrap();

        let expected = vec![
            ("city".to_string(), false),
            ("earth".to_string(), false),
            ("nature".to_string(), false),
            ("new".to_string(), false),
            ("nyc".to_string(), false),
            ("split".to_string(), false),
            ("word".to_string(), false),
            ("word".to_string(), true),
            ("world".to_string(), true),
            ("york".to_string(), false),

        ];
        let expected = hashset!{
            ("word",                0, false),
            ("nyc",                 0, false),
            ("wordsplit",           2, false),
            ("wordsplitnycworld",   2, true),
            ("nature",              0, false),
            ("new",                 0, false),
            ("city",                0, false),
            ("world",               1, true),
            ("york",                0, false),
            ("split",               0, false),
            ("nycworld",            1, true),
            ("earth",               0, false),
            ("wordsplitnyc",        2, false),
        };

        let mut keys = context.postings.keys().collect::<Vec<_>>();
        keys.sort_unstable();
        let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();

        let words = fetch_words(&query_tree, &set);

        let words = fetch_queries(&query_tree);
        assert_eq!(expected, words);
    }
}

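Finally, the net effect on highlighting can be summarized with a small standalone sketch. The struct and method names mirror the patch, but the constructor, the query, and the tokens are made up for illustration (the real from_query_tree walks the query tree via fetch_queries and reuses the cached LevBuilder factories through build_dfa):

    use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder as LevBuilder, DFA};

    // One (DFA, allowed typo count) pair per query word, as in the new MatchingWords.
    struct MatchingWords {
        dfas: Vec<(DFA, u8)>,
    }

    impl MatchingWords {
        // `queries` plays the role of what `fetch_queries` extracts from the query
        // tree: (word, allowed typos, is_prefix) triples.
        fn new(queries: &[(&str, u8, bool)]) -> Self {
            let dfas = queries.iter().map(|&(word, typos, is_prefix)| {
                // The real code reuses cached builders; the transposition flag is assumed.
                let lev = LevBuilder::new(typos, true);
                let dfa = if is_prefix { lev.build_prefix_dfa(word) } else { lev.build_dfa(word) };
                (dfa, typos)
            }).collect();
            Self { dfas }
        }

        // Same logic as the new `matches` method: accept the token if any query DFA
        // reaches it within the allowed number of typos.
        fn matches(&self, token: &str) -> bool {
            self.dfas.iter().any(|(dfa, typos)| match dfa.eval(token) {
                Distance::Exact(t) => t <= *typos,
                Distance::AtLeast(_) => false,
            })
        }
    }

    fn main() {
        // Hypothetical query: "world" tolerating one typo, plus the exact word "new".
        let words = MatchingWords::new(&[("world", 1, false), ("new", 0, false)]);
        assert!(words.matches("world"));
        assert!(words.matches("wordl")); // one transposition away
        assert!(words.matches("new"));
        assert!(!words.matches("york")); // not a query word, should not be highlighted
    }

This is where the speed-up comes from: nothing is derived against the words FST up front, and each displayed token costs only a handful of DFA evaluations.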