mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Introduce bucket_sort_with_distinct function
This commit is contained in:
		| @@ -1,17 +1,7 @@ | ||||
| mod dfa; | ||||
| mod query_enhancer; | ||||
|  | ||||
| use std::cmp::Reverse; | ||||
| use std::{cmp, fmt, vec}; | ||||
|  | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use levenshtein_automata::DFA; | ||||
| use meilisearch_tokenizer::{is_cjk, split_query_string}; | ||||
| use log::debug; | ||||
|  | ||||
| use crate::database::MainT; | ||||
| use crate::error::MResult; | ||||
| use crate::store; | ||||
| use meilisearch_tokenizer::is_cjk; | ||||
|  | ||||
| pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; | ||||
| pub use self::query_enhancer::QueryEnhancer; | ||||
| @@ -19,122 +9,6 @@ pub use self::query_enhancer::QueryEnhancerBuilder; | ||||
|  | ||||
| pub const NGRAMS: usize = 3; | ||||
|  | ||||
| pub struct AutomatonProducer { | ||||
|     automatons: Vec<AutomatonGroup>, | ||||
| } | ||||
|  | ||||
| impl AutomatonProducer { | ||||
|     pub fn new( | ||||
|         reader: &heed::RoTxn<MainT>, | ||||
|         query: &str, | ||||
|         main_store: store::Main, | ||||
|         postings_list_store: store::PostingsLists, | ||||
|         synonyms_store: store::Synonyms, | ||||
|     ) -> MResult<(AutomatonProducer, QueryEnhancer)> { | ||||
|         let (automatons, query_enhancer) = generate_automatons( | ||||
|             reader, | ||||
|             query, | ||||
|             main_store, | ||||
|             postings_list_store, | ||||
|             synonyms_store, | ||||
|         )?; | ||||
|  | ||||
|         for (i, group) in automatons.iter().enumerate() { | ||||
|             debug!("all automatons: group {} automatons {:?}", i, group.automatons); | ||||
|         } | ||||
|  | ||||
|         Ok((AutomatonProducer { automatons }, query_enhancer)) | ||||
|     } | ||||
|  | ||||
|     pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> { | ||||
|         self.automatons.into_iter() | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct AutomatonGroup { | ||||
|     pub is_phrase_query: bool, | ||||
|     pub automatons: Vec<Automaton>, | ||||
| } | ||||
|  | ||||
| impl AutomatonGroup { | ||||
|     fn normal(automatons: Vec<Automaton>) -> AutomatonGroup { | ||||
|         AutomatonGroup { | ||||
|             is_phrase_query: false, | ||||
|             automatons, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup { | ||||
|         AutomatonGroup { | ||||
|             is_phrase_query: true, | ||||
|             automatons, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Automaton { | ||||
|     pub index: usize, | ||||
|     pub ngram: usize, | ||||
|     pub query_len: usize, | ||||
|     pub is_exact: bool, | ||||
|     pub is_prefix: bool, | ||||
|     pub query: String, | ||||
| } | ||||
|  | ||||
| impl fmt::Debug for Automaton { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||
|         f.debug_struct("Automaton") | ||||
|             .field("index", &self.index) | ||||
|             .field("query", &self.query) | ||||
|             .field("is_prefix", &self.is_prefix) | ||||
|             .finish() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Automaton { | ||||
|     pub fn dfa(&self) -> DFA { | ||||
|         if self.is_prefix { | ||||
|             build_prefix_dfa(&self.query) | ||||
|         } else { | ||||
|             build_dfa(&self.query) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn exact(index: usize, ngram: usize, query: &str) -> Automaton { | ||||
|         Automaton { | ||||
|             index, | ||||
|             ngram, | ||||
|             query_len: query.len(), | ||||
|             is_exact: true, | ||||
|             is_prefix: false, | ||||
|             query: query.to_string(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { | ||||
|         Automaton { | ||||
|             index, | ||||
|             ngram, | ||||
|             query_len: query.len(), | ||||
|             is_exact: true, | ||||
|             is_prefix: true, | ||||
|             query: query.to_string(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { | ||||
|         Automaton { | ||||
|             index, | ||||
|             ngram, | ||||
|             query_len: query.len(), | ||||
|             is_exact: false, | ||||
|             is_prefix: false, | ||||
|             query: query.to_string(), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn normalize_str(string: &str) -> String { | ||||
|     let mut string = string.to_lowercase(); | ||||
|  | ||||
| @@ -144,167 +18,3 @@ pub fn normalize_str(string: &str) -> String { | ||||
|  | ||||
|     string | ||||
| } | ||||
|  | ||||
| pub fn split_best_frequency<'a>( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     word: &'a str, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
| ) -> MResult<Option<(&'a str, &'a str)>> { | ||||
|     let chars = word.char_indices().skip(1); | ||||
|     let mut best = None; | ||||
|  | ||||
|     for (i, _) in chars { | ||||
|         let (left, right) = word.split_at(i); | ||||
|  | ||||
|         let left_freq = postings_lists_store | ||||
|             .postings_list(reader, left.as_ref())? | ||||
|             .map_or(0, |i| i.len()); | ||||
|  | ||||
|         let right_freq = postings_lists_store | ||||
|             .postings_list(reader, right.as_ref())? | ||||
|             .map_or(0, |i| i.len()); | ||||
|  | ||||
|         let min_freq = cmp::min(left_freq, right_freq); | ||||
|         if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { | ||||
|             best = Some((min_freq, left, right)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(best.map(|(_, l, r)| (l, r))) | ||||
| } | ||||
|  | ||||
| fn generate_automatons( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     query: &str, | ||||
|     main_store: store::Main, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
|     synonym_store: store::Synonyms, | ||||
| ) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> { | ||||
|     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); | ||||
|     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); | ||||
|     let synonyms = match main_store.synonyms_fst(reader)? { | ||||
|         Some(synonym) => synonym, | ||||
|         None => fst::Set::default(), | ||||
|     }; | ||||
|  | ||||
|     let mut automaton_index = 0; | ||||
|     let mut automatons = Vec::new(); | ||||
|     let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); | ||||
|  | ||||
|     // We must not declare the original words to the query enhancer | ||||
|     // *but* we need to push them in the automatons list first | ||||
|     let mut original_automatons = Vec::new(); | ||||
|     let mut original_words = query_words.iter().peekable(); | ||||
|     while let Some(word) = original_words.next() { | ||||
|         let has_following_word = original_words.peek().is_some(); | ||||
|         let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); | ||||
|  | ||||
|         let automaton = if not_prefix_dfa { | ||||
|             Automaton::exact(automaton_index, 1, word) | ||||
|         } else { | ||||
|             Automaton::prefix_exact(automaton_index, 1, word) | ||||
|         }; | ||||
|         automaton_index += 1; | ||||
|         original_automatons.push(automaton); | ||||
|     } | ||||
|  | ||||
|     automatons.push(AutomatonGroup::normal(original_automatons)); | ||||
|  | ||||
|     for n in 1..=NGRAMS { | ||||
|         let mut ngrams = query_words.windows(n).enumerate().peekable(); | ||||
|         while let Some((query_index, ngram_slice)) = ngrams.next() { | ||||
|             let query_range = query_index..query_index + n; | ||||
|             let ngram_nb_words = ngram_slice.len(); | ||||
|             let ngram = ngram_slice.join(" "); | ||||
|  | ||||
|             let has_following_word = ngrams.peek().is_some(); | ||||
|             let not_prefix_dfa = | ||||
|                 has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); | ||||
|  | ||||
|             // automaton of synonyms of the ngrams | ||||
|             let normalized = normalize_str(&ngram); | ||||
|             let lev = if not_prefix_dfa { | ||||
|                 build_dfa(&normalized) | ||||
|             } else { | ||||
|                 build_prefix_dfa(&normalized) | ||||
|             }; | ||||
|  | ||||
|             let mut stream = synonyms.search(&lev).into_stream(); | ||||
|             while let Some(base) = stream.next() { | ||||
|                 // only trigger alternatives when the last word has been typed | ||||
|                 // i.e. "new " do not but "new yo" triggers alternatives to "new york" | ||||
|                 let base = std::str::from_utf8(base).unwrap(); | ||||
|                 let base_nb_words = split_query_string(base).count(); | ||||
|                 if ngram_nb_words != base_nb_words { | ||||
|                     continue; | ||||
|                 } | ||||
|  | ||||
|                 if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? { | ||||
|                     let mut stream = synonyms.into_stream(); | ||||
|                     while let Some(synonyms) = stream.next() { | ||||
|                         let synonyms = std::str::from_utf8(synonyms).unwrap(); | ||||
|                         let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); | ||||
|                         let nb_synonym_words = synonyms_words.len(); | ||||
|  | ||||
|                         let real_query_index = automaton_index; | ||||
|                         enhancer_builder.declare( | ||||
|                             query_range.clone(), | ||||
|                             real_query_index, | ||||
|                             &synonyms_words, | ||||
|                         ); | ||||
|  | ||||
|                         for synonym in synonyms_words { | ||||
|                             let automaton = if nb_synonym_words == 1 { | ||||
|                                 Automaton::exact(automaton_index, n, synonym) | ||||
|                             } else { | ||||
|                                 Automaton::non_exact(automaton_index, n, synonym) | ||||
|                             }; | ||||
|                             automaton_index += 1; | ||||
|                             automatons.push(AutomatonGroup::normal(vec![automaton])); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             if n == 1 { | ||||
|                 if let Some((left, right)) = | ||||
|                     split_best_frequency(reader, &normalized, postings_lists_store)? | ||||
|                 { | ||||
|                     let a = Automaton::exact(automaton_index, 1, left); | ||||
|                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); | ||||
|                     automaton_index += 1; | ||||
|  | ||||
|                     let b = Automaton::exact(automaton_index, 1, right); | ||||
|                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); | ||||
|                     automaton_index += 1; | ||||
|  | ||||
|                     automatons.push(AutomatonGroup::phrase_query(vec![a, b])); | ||||
|                 } | ||||
|             } else { | ||||
|                 // automaton of concatenation of query words | ||||
|                 let concat = ngram_slice.concat(); | ||||
|                 let normalized = normalize_str(&concat); | ||||
|  | ||||
|                 let real_query_index = automaton_index; | ||||
|                 enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); | ||||
|  | ||||
|                 let automaton = Automaton::exact(automaton_index, n, &normalized); | ||||
|                 automaton_index += 1; | ||||
|                 automatons.push(AutomatonGroup::normal(vec![automaton])); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // order automatons, the most important first, | ||||
|     // we keep the original automatons at the front. | ||||
|     automatons[1..].sort_by_key(|group| { | ||||
|         let a = group.automatons.first().unwrap(); | ||||
|         ( | ||||
|             Reverse(a.is_exact), | ||||
|             a.ngram, | ||||
|             Reverse(group.automatons.len()), | ||||
|         ) | ||||
|     }); | ||||
|  | ||||
|     Ok((automatons, enhancer_builder.build())) | ||||
| } | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| use std::ops::Deref; | ||||
| use std::fmt; | ||||
| use std::{cmp, fmt}; | ||||
| use std::borrow::Cow; | ||||
| use std::mem; | ||||
| use std::ops::Range; | ||||
| @@ -8,43 +8,68 @@ use std::time::{Duration, Instant}; | ||||
|  | ||||
| use compact_arena::{SmallArena, Idx32, mk_arena}; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use hashbrown::HashMap; | ||||
| use levenshtein_automata::DFA; | ||||
| use log::debug; | ||||
| use meilisearch_tokenizer::{is_cjk, split_query_string}; | ||||
| use meilisearch_types::{DocIndex, Highlight}; | ||||
| use meilisearch_types::DocIndex; | ||||
| use sdset::{Set, SetBuf}; | ||||
| use slice_group_by::{GroupBy, GroupByMut}; | ||||
|  | ||||
| use crate::automaton::NGRAMS; | ||||
| use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; | ||||
| use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; | ||||
| use crate::automaton::{normalize_str, split_best_frequency}; | ||||
| use crate::automaton::normalize_str; | ||||
| use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; | ||||
|  | ||||
| use crate::criterion::Criteria; | ||||
| use crate::levenshtein::prefix_damerau_levenshtein; | ||||
| use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; | ||||
| use crate::raw_document::RawDocument; | ||||
| use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; | ||||
| use crate::{store, Document, DocumentId, MResult}; | ||||
|  | ||||
| pub fn bucket_sort<'c>( | ||||
| pub fn bucket_sort<'c, FI>( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     query: &str, | ||||
|     range: Range<usize>, | ||||
|     filter: Option<FI>, | ||||
|     criteria: Criteria<'c>, | ||||
|     main_store: store::Main, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
|     documents_fields_counts_store: store::DocumentsFieldsCounts, | ||||
|     synonyms_store: store::Synonyms, | ||||
| ) -> MResult<Vec<Document>> | ||||
| where | ||||
|     FI: Fn(DocumentId) -> bool, | ||||
| { | ||||
|     // We delegate the filter work to the distinct query builder, | ||||
|     // specifying a distinct rule that has no effect. | ||||
|     if filter.is_some() { | ||||
|         let distinct = |_| None; | ||||
|         let distinct_size = 1; | ||||
|         return bucket_sort_with_distinct( | ||||
|             reader, | ||||
|             query, | ||||
|             range, | ||||
|             filter, | ||||
|             distinct, | ||||
|             distinct_size, | ||||
|             criteria, | ||||
|             main_store, | ||||
|             postings_lists_store, | ||||
|             documents_fields_counts_store, | ||||
|             synonyms_store, | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     let (automatons, query_enhancer) = | ||||
|         construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; | ||||
|         construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; | ||||
|  | ||||
|     debug!("{:?}", query_enhancer); | ||||
|  | ||||
|     let before_postings_lists_fetching = Instant::now(); | ||||
|     mk_arena!(arena); | ||||
|     let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; | ||||
|     let mut bare_matches = | ||||
|         fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; | ||||
|     debug!("bare matches ({}) retrieved in {:.02?}", | ||||
|         bare_matches.len(), | ||||
|         before_postings_lists_fetching.elapsed(), | ||||
| @@ -69,9 +94,6 @@ pub fn bucket_sort<'c>( | ||||
|         before_raw_documents_building.elapsed(), | ||||
|     ); | ||||
|  | ||||
|     dbg!(mem::size_of::<BareMatch>()); | ||||
|     dbg!(mem::size_of::<SimpleMatch>()); | ||||
|  | ||||
|     let mut groups = vec![raw_documents.as_mut_slice()]; | ||||
|  | ||||
|     'criteria: for criterion in criteria.as_ref() { | ||||
| @@ -103,31 +125,166 @@ pub fn bucket_sort<'c>( | ||||
|     } | ||||
|  | ||||
|     let iter = raw_documents.into_iter().skip(range.start).take(range.len()); | ||||
|     let iter = iter.map(|d| { | ||||
|         let highlights = d.raw_matches.iter().flat_map(|sm| { | ||||
|             let postings_list = &arena[sm.postings_list]; | ||||
|             let input = postings_list.input(); | ||||
|             let query = &automatons[sm.query_index as usize].query; | ||||
|             postings_list.iter().map(move |m| { | ||||
|                 let covered_area = if query.len() > input.len() { | ||||
|                     input.len() | ||||
|                 } else { | ||||
|                     prefix_damerau_levenshtein(query.as_bytes(), input).1 | ||||
|                 }; | ||||
|                 Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 } | ||||
|             }) | ||||
|         }).collect(); | ||||
|  | ||||
|         Document { | ||||
|             id: d.id, | ||||
|             highlights, | ||||
|             #[cfg(test)] matches: Vec::new(), | ||||
|         } | ||||
|     }); | ||||
|     let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena)); | ||||
|  | ||||
|     Ok(iter.collect()) | ||||
| } | ||||
|  | ||||
| pub fn bucket_sort_with_distinct<'c, FI, FD>( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     query: &str, | ||||
|     range: Range<usize>, | ||||
|     filter: Option<FI>, | ||||
|     distinct: FD, | ||||
|     distinct_size: usize, | ||||
|     criteria: Criteria<'c>, | ||||
|     main_store: store::Main, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
|     documents_fields_counts_store: store::DocumentsFieldsCounts, | ||||
|     synonyms_store: store::Synonyms, | ||||
| ) -> MResult<Vec<Document>> | ||||
| where | ||||
|     FI: Fn(DocumentId) -> bool, | ||||
|     FD: Fn(DocumentId) -> Option<u64>, | ||||
| { | ||||
|     let (automatons, query_enhancer) = | ||||
|         construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; | ||||
|  | ||||
|     let before_postings_lists_fetching = Instant::now(); | ||||
|     mk_arena!(arena); | ||||
|     let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; | ||||
|     debug!("bare matches ({}) retrieved in {:.02?}", | ||||
|         bare_matches.len(), | ||||
|         before_postings_lists_fetching.elapsed(), | ||||
|     ); | ||||
|  | ||||
|     let before_raw_documents_presort = Instant::now(); | ||||
|     bare_matches.sort_unstable_by_key(|sm| sm.document_id); | ||||
|     debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed()); | ||||
|  | ||||
|     let before_raw_documents_building = Instant::now(); | ||||
|     let mut prefiltered_documents = 0; | ||||
|     let mut raw_documents = Vec::new(); | ||||
|     for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { | ||||
|         prefiltered_documents += 1; | ||||
|         if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) { | ||||
|             raw_documents.push(raw_document); | ||||
|         } | ||||
|     } | ||||
|     debug!("creating {} (original {}) candidates documents took {:.02?}", | ||||
|         raw_documents.len(), | ||||
|         prefiltered_documents, | ||||
|         before_raw_documents_building.elapsed(), | ||||
|     ); | ||||
|  | ||||
|     let mut groups = vec![raw_documents.as_mut_slice()]; | ||||
|     let mut key_cache = HashMap::new(); | ||||
|  | ||||
|     let mut filter_map = HashMap::new(); | ||||
|     // these two variables informs on the current distinct map and | ||||
|     // on the raw offset of the start of the group where the | ||||
|     // range.start bound is located according to the distinct function | ||||
|     let mut distinct_map = DistinctMap::new(distinct_size); | ||||
|     let mut distinct_raw_offset = 0; | ||||
|  | ||||
|     'criteria: for criterion in criteria.as_ref() { | ||||
|         let tmp_groups = mem::replace(&mut groups, Vec::new()); | ||||
|         let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); | ||||
|         let mut documents_seen = 0; | ||||
|  | ||||
|         for mut group in tmp_groups { | ||||
|             // if this group does not overlap with the requested range, | ||||
|             // push it without sorting and splitting it | ||||
|             if documents_seen + group.len() < distinct_raw_offset { | ||||
|                 documents_seen += group.len(); | ||||
|                 groups.push(group); | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             let before_criterion_preparation = Instant::now(); | ||||
|             criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons); | ||||
|             debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); | ||||
|  | ||||
|             let before_criterion_sort = Instant::now(); | ||||
|             group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena)); | ||||
|             debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); | ||||
|  | ||||
|             for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) { | ||||
|                 // we must compute the real distinguished len of this sub-group | ||||
|                 for document in group.iter() { | ||||
|                     let filter_accepted = match &filter { | ||||
|                         Some(filter) => { | ||||
|                             let entry = filter_map.entry(document.id); | ||||
|                             *entry.or_insert_with(|| (filter)(document.id)) | ||||
|                         } | ||||
|                         None => true, | ||||
|                     }; | ||||
|  | ||||
|                     if filter_accepted { | ||||
|                         let entry = key_cache.entry(document.id); | ||||
|                         let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); | ||||
|  | ||||
|                         match key.clone() { | ||||
|                             Some(key) => buf_distinct.register(key), | ||||
|                             None => buf_distinct.register_without_key(), | ||||
|                         }; | ||||
|                     } | ||||
|  | ||||
|                     // the requested range end is reached: stop computing distinct | ||||
|                     if buf_distinct.len() >= range.end { | ||||
|                         break; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 documents_seen += group.len(); | ||||
|                 groups.push(group); | ||||
|  | ||||
|                 // if this sub-group does not overlap with the requested range | ||||
|                 // we must update the distinct map and its start index | ||||
|                 if buf_distinct.len() < range.start { | ||||
|                     buf_distinct.transfert_to_internal(); | ||||
|                     distinct_raw_offset = documents_seen; | ||||
|                 } | ||||
|  | ||||
|                 // we have sort enough documents if the last document sorted is after | ||||
|                 // the end of the requested range, we can continue to the next criterion | ||||
|                 if buf_distinct.len() >= range.end { | ||||
|                     continue 'criteria; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // once we classified the documents related to the current | ||||
|     // automatons we save that as the next valid result | ||||
|     let mut seen = BufferedDistinctMap::new(&mut distinct_map); | ||||
|  | ||||
|     let mut documents = Vec::with_capacity(range.len()); | ||||
|     for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) { | ||||
|         let filter_accepted = match &filter { | ||||
|             Some(_) => filter_map.remove(&raw_document.id).unwrap(), | ||||
|             None => true, | ||||
|         }; | ||||
|  | ||||
|         if filter_accepted { | ||||
|             let key = key_cache.remove(&raw_document.id).unwrap(); | ||||
|             let distinct_accepted = match key { | ||||
|                 Some(key) => seen.register(key), | ||||
|                 None => seen.register_without_key(), | ||||
|             }; | ||||
|  | ||||
|             if distinct_accepted && seen.len() > range.start { | ||||
|                 documents.push(Document::from_raw(raw_document, &automatons, &arena)); | ||||
|                 if documents.len() == range.len() { | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(documents) | ||||
| } | ||||
|  | ||||
| pub struct BareMatch<'tag> { | ||||
|     pub document_id: DocumentId, | ||||
|     pub query_index: u16, | ||||
| @@ -257,7 +414,7 @@ fn fetch_matches<'txn, 'tag>( | ||||
|     postings_lists_store: store::PostingsLists, | ||||
| ) -> MResult<Vec<BareMatch<'tag>>> | ||||
| { | ||||
|     let mut before_words_fst = Instant::now(); | ||||
|     let before_words_fst = Instant::now(); | ||||
|     let words = match main_store.words_fst(reader)? { | ||||
|         Some(words) => words, | ||||
|         None => return Ok(Vec::new()), | ||||
| @@ -273,7 +430,7 @@ fn fetch_matches<'txn, 'tag>( | ||||
|     for (query_index, automaton) in automatons.iter().enumerate() { | ||||
|         let before_dfa = Instant::now(); | ||||
|         let dfa = automaton.dfa(); | ||||
|         let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton; | ||||
|         let QueryWordAutomaton { query, is_exact, .. } = automaton; | ||||
|         dfa_time += before_dfa.elapsed(); | ||||
|  | ||||
|         let mut number_of_words = 0; | ||||
| @@ -381,7 +538,35 @@ impl QueryWordAutomaton { | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn construct_automatons2( | ||||
| fn split_best_frequency<'a>( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     word: &'a str, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
| ) -> MResult<Option<(&'a str, &'a str)>> { | ||||
|     let chars = word.char_indices().skip(1); | ||||
|     let mut best = None; | ||||
|  | ||||
|     for (i, _) in chars { | ||||
|         let (left, right) = word.split_at(i); | ||||
|  | ||||
|         let left_freq = postings_lists_store | ||||
|             .postings_list(reader, left.as_ref())? | ||||
|             .map_or(0, |i| i.len()); | ||||
|  | ||||
|         let right_freq = postings_lists_store | ||||
|             .postings_list(reader, right.as_ref())? | ||||
|             .map_or(0, |i| i.len()); | ||||
|  | ||||
|         let min_freq = cmp::min(left_freq, right_freq); | ||||
|         if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { | ||||
|             best = Some((min_freq, left, right)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(best.map(|(_, l, r)| (l, r))) | ||||
| } | ||||
|  | ||||
| fn construct_automatons( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     query: &str, | ||||
|     main_store: store::Main, | ||||
|   | ||||
| @@ -30,6 +30,10 @@ pub use self::store::Index; | ||||
| pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; | ||||
| pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use crate::bucket_sort::{QueryWordAutomaton, PostingsListView}; | ||||
| use crate::levenshtein::prefix_damerau_levenshtein; | ||||
|  | ||||
| #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||
| pub struct Document { | ||||
|     pub id: DocumentId, | ||||
| @@ -39,6 +43,36 @@ pub struct Document { | ||||
|     // pub matches: Vec<TmpMatch>, | ||||
| } | ||||
|  | ||||
| impl Document { | ||||
|     pub fn from_raw<'a, 'tag, 'txn>( | ||||
|         raw_document: RawDocument<'a, 'tag>, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|         arena: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Document | ||||
|     { | ||||
|         let highlights = raw_document.raw_matches.iter().flat_map(|sm| { | ||||
|             let postings_list = &arena[sm.postings_list]; | ||||
|             let input = postings_list.input(); | ||||
|             let query = &automatons[sm.query_index as usize].query; | ||||
|             postings_list.iter().map(move |m| { | ||||
|                 let covered_area = if query.len() > input.len() { | ||||
|                     input.len() | ||||
|                 } else { | ||||
|                     prefix_damerau_levenshtein(query.as_bytes(), input).1 | ||||
|                 }; | ||||
|  | ||||
|                 Highlight { | ||||
|                     attribute: m.attribute, | ||||
|                     char_index: m.char_index, | ||||
|                     char_length: covered_area as u16, | ||||
|                 } | ||||
|             }) | ||||
|         }).collect(); | ||||
|  | ||||
|         Document { id: raw_document.id, highlights } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|   | ||||
| @@ -1,7 +1,8 @@ | ||||
| use std::ops::Range; | ||||
| use std::time::Duration; | ||||
|  | ||||
| use crate::{bucket_sort::bucket_sort, database::MainT}; | ||||
| use crate::database::MainT; | ||||
| use crate::bucket_sort::{bucket_sort, bucket_sort_with_distinct}; | ||||
| use crate::{criterion::Criteria, Document, DocumentId}; | ||||
| use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; | ||||
|  | ||||
| @@ -85,11 +86,24 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | ||||
|         range: Range<usize>, | ||||
|     ) -> MResult<Vec<Document>> { | ||||
|         match self.distinct { | ||||
|             Some((distinct, distinct_size)) => unimplemented!("distinct"), | ||||
|             Some((distinct, distinct_size)) => bucket_sort_with_distinct( | ||||
|                 reader, | ||||
|                 query, | ||||
|                 range, | ||||
|                 self.filter, | ||||
|                 distinct, | ||||
|                 distinct_size, | ||||
|                 self.criteria, | ||||
|                 self.main_store, | ||||
|                 self.postings_lists_store, | ||||
|                 self.documents_fields_counts_store, | ||||
|                 self.synonyms_store, | ||||
|             ), | ||||
|             None => bucket_sort( | ||||
|                 reader, | ||||
|                 query, | ||||
|                 range, | ||||
|                 self.filter, | ||||
|                 self.criteria, | ||||
|                 self.main_store, | ||||
|                 self.postings_lists_store, | ||||
|   | ||||
| @@ -44,7 +44,7 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||
|                     let pla = &postings_lists[a.postings_list]; | ||||
|                     let plb = &postings_lists[b.postings_list]; | ||||
|  | ||||
|                     let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||
|                     let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||
|                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) | ||||
|                     }); | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user