mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	Update the criteria to the new ones
This commit is contained in:
		| @@ -1,9 +1,6 @@ | ||||
| use std::ops::Deref; | ||||
| use std::fmt; | ||||
| use std::borrow::Cow; | ||||
| use std::cmp::Ordering; | ||||
| use std::collections::HashSet; | ||||
| use std::io::Write; | ||||
| use std::mem; | ||||
| use std::ops::Range; | ||||
| use std::rc::Rc; | ||||
| @@ -17,15 +14,15 @@ use meilisearch_tokenizer::{is_cjk, split_query_string}; | ||||
| use meilisearch_types::{DocIndex, Highlight}; | ||||
| use sdset::{Set, SetBuf}; | ||||
| use slice_group_by::{GroupBy, GroupByMut}; | ||||
| use itertools::EitherOrBoth; | ||||
|  | ||||
| use crate::automaton::NGRAMS; | ||||
| use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; | ||||
| use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; | ||||
| use crate::automaton::{normalize_str, split_best_frequency}; | ||||
|  | ||||
| use crate::criterion2::*; | ||||
| use crate::criterion::Criteria; | ||||
| use crate::levenshtein::prefix_damerau_levenshtein; | ||||
| use crate::raw_document::RawDocument; | ||||
| use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; | ||||
| use crate::{store, Document, DocumentId, MResult}; | ||||
|  | ||||
| @@ -33,6 +30,7 @@ pub fn bucket_sort<'c>( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     query: &str, | ||||
|     range: Range<usize>, | ||||
|     criteria: Criteria<'c>, | ||||
|     main_store: store::Main, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
|     documents_fields_counts_store: store::DocumentsFieldsCounts, | ||||
| @@ -76,17 +74,7 @@ pub fn bucket_sort<'c>( | ||||
|  | ||||
|     let mut groups = vec![raw_documents.as_mut_slice()]; | ||||
|  | ||||
|     let criteria = [ | ||||
|         Box::new(Typo) as Box<dyn Criterion>, | ||||
|         Box::new(Words), | ||||
|         Box::new(Proximity), | ||||
|         Box::new(Attribute), | ||||
|         Box::new(WordsPosition), | ||||
|         Box::new(Exact), | ||||
|         Box::new(StableDocId), | ||||
|     ]; | ||||
|  | ||||
|     'criteria: for criterion in &criteria { | ||||
|     'criteria: for criterion in criteria.as_ref() { | ||||
|         let tmp_groups = mem::replace(&mut groups, Vec::new()); | ||||
|         let mut documents_seen = 0; | ||||
|  | ||||
| @@ -131,7 +119,7 @@ pub fn bucket_sort<'c>( | ||||
|         }).collect(); | ||||
|  | ||||
|         Document { | ||||
|             id: d.raw_matches[0].document_id, | ||||
|             id: d.id, | ||||
|             highlights, | ||||
|             #[cfg(test)] matches: Vec::new(), | ||||
|         } | ||||
| @@ -140,88 +128,6 @@ pub fn bucket_sort<'c>( | ||||
|     Ok(iter.collect()) | ||||
| } | ||||
|  | ||||
| pub struct RawDocument<'a, 'tag> { | ||||
|     pub raw_matches: &'a mut [BareMatch<'tag>], | ||||
|     pub processed_matches: Vec<SimpleMatch>, | ||||
|     /// The list of minimum `distance` found | ||||
|     pub processed_distances: Vec<Option<u8>>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||
|     fn new<'txn>( | ||||
|         raw_matches: &'a mut [BareMatch<'tag>], | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Option<RawDocument<'a, 'tag>> | ||||
|     { | ||||
|         raw_matches.sort_unstable_by_key(|m| m.query_index); | ||||
|  | ||||
|         let mut previous_word = None; | ||||
|         for i in 0..raw_matches.len() { | ||||
|             let a = &raw_matches[i]; | ||||
|             let auta = &automatons[a.query_index as usize]; | ||||
|  | ||||
|             match auta.phrase_query { | ||||
|                 Some((0, _)) => { | ||||
|                     let b = match raw_matches.get(i + 1) { | ||||
|                         Some(b) => b, | ||||
|                         None => { | ||||
|                             postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                             continue; | ||||
|                         } | ||||
|                     }; | ||||
|  | ||||
|                     if a.query_index + 1 != b.query_index { | ||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                         continue | ||||
|                     } | ||||
|  | ||||
|                     let pla = &postings_lists[a.postings_list]; | ||||
|                     let plb = &postings_lists[b.postings_list]; | ||||
|  | ||||
|                     let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||
|                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) | ||||
|                     }); | ||||
|  | ||||
|                     let mut newa = Vec::new(); | ||||
|                     let mut newb = Vec::new(); | ||||
|  | ||||
|                     for eb in iter { | ||||
|                         if let EitherOrBoth::Both(a, b) = eb { | ||||
|                             newa.push(*a); | ||||
|                             newb.push(*b); | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     if !newa.is_empty() { | ||||
|                         previous_word = Some(a.query_index); | ||||
|                     } | ||||
|  | ||||
|                     postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); | ||||
|                     postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); | ||||
|                 }, | ||||
|                 Some((1, _)) => { | ||||
|                     if previous_word.take() != Some(a.query_index - 1) { | ||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                     } | ||||
|                 }, | ||||
|                 Some((_, _)) => unreachable!(), | ||||
|                 None => (), | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { | ||||
|             return None | ||||
|         } | ||||
|  | ||||
|         Some(RawDocument { | ||||
|             raw_matches, | ||||
|             processed_matches: Vec::new(), | ||||
|             processed_distances: Vec::new(), | ||||
|         }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct BareMatch<'tag> { | ||||
|     pub document_id: DocumentId, | ||||
|     pub query_index: u16, | ||||
|   | ||||
							
								
								
									
										48
									
								
								meilisearch-core/src/criterion/attribute.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								meilisearch-core/src/criterion/attribute.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | ||||
| use std::cmp::{self, Ordering}; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| use super::{Criterion, prepare_raw_matches}; | ||||
|  | ||||
| pub struct Attribute; | ||||
|  | ||||
| impl Criterion for Attribute { | ||||
|     fn name(&self) -> &str { "attribute" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn best_attribute(matches: &[SimpleMatch]) -> u16 { | ||||
|             let mut best_attribute = u16::max_value(); | ||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||
|                 best_attribute = cmp::min(best_attribute, group[0].attribute); | ||||
|             } | ||||
|             best_attribute | ||||
|         } | ||||
|  | ||||
|         let lhs = best_attribute(&lhs.processed_matches); | ||||
|         let rhs = best_attribute(&rhs.processed_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
| @@ -1,16 +1,37 @@ | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| use compact_arena::SmallArena; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
| use super::Criterion; | ||||
|  | ||||
| pub struct DocumentId; | ||||
|  | ||||
| impl Criterion for DocumentId { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         lhs.id.cmp(&rhs.id) | ||||
|     fn name(&self) -> &str { "stable document id" } | ||||
|  | ||||
|     fn prepare( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument], | ||||
|         postings_lists: &mut SmallArena<PostingsListView>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         // ... | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "DocumentId" | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &SmallArena<PostingsListView>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         let lhs = &lhs.id; | ||||
|         let rhs = &rhs.id; | ||||
|  | ||||
|         lhs.cmp(rhs) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,131 +1,51 @@ | ||||
| use std::cmp::Ordering; | ||||
| use std::cmp::{Ordering, Reverse}; | ||||
|  | ||||
| use sdset::Set; | ||||
| use compact_arena::SmallArena; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::criterion::Criterion; | ||||
| use crate::{AttrCount, RawDocument}; | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
| use super::Criterion; | ||||
|  | ||||
| #[inline] | ||||
| fn number_exact_matches( | ||||
|     query_index: &[u32], | ||||
|     attribute: &[u16], | ||||
|     is_exact: &[bool], | ||||
|     fields_counts: &Set<AttrCount>, | ||||
| ) -> usize { | ||||
|     let mut count = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         let len = group.len(); | ||||
|  | ||||
|         let mut found_exact = false; | ||||
|         for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() { | ||||
|             if *is_exact { | ||||
|                 found_exact = true; | ||||
|                 let attr = &attribute[index + pos]; | ||||
|                 if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) { | ||||
|                     let AttrCount { count, .. } = fields_counts[pos]; | ||||
|                     if count == 1 { | ||||
|                         return usize::max_value(); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         count += found_exact as usize; | ||||
|         index += len; | ||||
|     } | ||||
|  | ||||
|     count | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct Exact; | ||||
|  | ||||
| impl Criterion for Exact { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let is_exact = lhs.is_exact(); | ||||
|             let attribute = lhs.attribute(); | ||||
|             let fields_counts = lhs.fields_counts.as_ref().unwrap(); | ||||
|     fn name(&self) -> &str { "exact" } | ||||
|  | ||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||
|         }; | ||||
|     fn prepare( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument], | ||||
|         postings_lists: &mut SmallArena<PostingsListView>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         for document in documents { | ||||
|             document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let is_exact = rhs.is_exact(); | ||||
|             let attribute = rhs.attribute(); | ||||
|             let fields_counts = rhs.fields_counts.as_ref().unwrap(); | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &SmallArena<PostingsListView>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn sum_exact_query_words(matches: &[BareMatch]) -> usize { | ||||
|             let mut sum_exact_query_words = 0; | ||||
|  | ||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||
|         }; | ||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||
|                 sum_exact_query_words += group[0].is_exact as usize; | ||||
|             } | ||||
|  | ||||
|             sum_exact_query_words | ||||
|         } | ||||
|  | ||||
|         let lhs = sum_exact_query_words(&lhs.raw_matches); | ||||
|         let rhs = sum_exact_query_words(&rhs.raw_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "Exact" | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|  | ||||
|     // typing: "soulier" | ||||
|     // | ||||
|     // doc0: "Soulier bleu" | ||||
|     // doc1: "souliereres rouge" | ||||
|     #[test] | ||||
|     fn easy_case() { | ||||
|         let doc0 = { | ||||
|             let query_index = &[0]; | ||||
|             let attribute = &[0]; | ||||
|             let is_exact = &[true]; | ||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); | ||||
|  | ||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||
|         }; | ||||
|  | ||||
|         let doc1 = { | ||||
|             let query_index = &[0]; | ||||
|             let attribute = &[0]; | ||||
|             let is_exact = &[false]; | ||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); | ||||
|  | ||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); | ||||
|     } | ||||
|  | ||||
|     // typing: "soulier" | ||||
|     // | ||||
|     // doc0: { 0. "soulier" } | ||||
|     // doc1: { 0. "soulier bleu et blanc" } | ||||
|     #[test] | ||||
|     fn basic() { | ||||
|         let doc0 = { | ||||
|             let query_index = &[0]; | ||||
|             let attribute = &[0]; | ||||
|             let is_exact = &[true]; | ||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); | ||||
|  | ||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||
|         }; | ||||
|  | ||||
|         let doc1 = { | ||||
|             let query_index = &[0]; | ||||
|             let attribute = &[0]; | ||||
|             let is_exact = &[true]; | ||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); | ||||
|  | ||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,58 +1,58 @@ | ||||
| mod document_id; | ||||
| mod exact; | ||||
| mod number_of_words; | ||||
| mod sort_by_attr; | ||||
| mod sum_of_typos; | ||||
| mod sum_of_words_attribute; | ||||
| mod sum_of_words_position; | ||||
| mod words_proximity; | ||||
| use std::cmp::{self, Ordering}; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use sdset::SetBuf; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| pub use self::{ | ||||
|     document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, | ||||
|     sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, | ||||
|     sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, | ||||
|     words_proximity::WordsProximity, | ||||
| }; | ||||
| mod typo; | ||||
| mod words; | ||||
| mod proximity; | ||||
| mod attribute; | ||||
| mod words_position; | ||||
| mod exact; | ||||
| mod document_id; | ||||
| mod sort_by_attr; | ||||
|  | ||||
| pub trait Criterion: Send + Sync { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; | ||||
| pub use self::typo::Typo; | ||||
| pub use self::words::Words; | ||||
| pub use self::proximity::Proximity; | ||||
| pub use self::attribute::Attribute; | ||||
| pub use self::words_position::WordsPosition; | ||||
| pub use self::exact::Exact; | ||||
| pub use self::document_id::DocumentId; | ||||
| pub use self::sort_by_attr::SortByAttr; | ||||
|  | ||||
| pub trait Criterion { | ||||
|     fn name(&self) -> &str; | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ); | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Ordering; | ||||
|  | ||||
|     #[inline] | ||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { | ||||
|         self.evaluate(lhs, rhs) == Ordering::Equal | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         (**self).evaluate(lhs, rhs) | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         (**self).name() | ||||
|     } | ||||
|  | ||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { | ||||
|         (**self).eq(lhs, rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<T: Criterion + ?Sized> Criterion for Box<T> { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         (**self).evaluate(lhs, rhs) | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         (**self).name() | ||||
|     } | ||||
|  | ||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { | ||||
|         (**self).eq(lhs, rhs) | ||||
|     fn eq<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> bool | ||||
|     { | ||||
|         self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -103,11 +103,11 @@ pub struct Criteria<'a> { | ||||
| impl<'a> Default for Criteria<'a> { | ||||
|     fn default() -> Self { | ||||
|         CriteriaBuilder::with_capacity(7) | ||||
|             .add(SumOfTypos) | ||||
|             .add(NumberOfWords) | ||||
|             .add(WordsProximity) | ||||
|             .add(SumOfWordsAttribute) | ||||
|             .add(SumOfWordsPosition) | ||||
|             .add(Typo) | ||||
|             .add(Words) | ||||
|             .add(Proximity) | ||||
|             .add(Attribute) | ||||
|             .add(WordsPosition) | ||||
|             .add(Exact) | ||||
|             .add(DocumentId) | ||||
|             .build() | ||||
| @@ -119,3 +119,165 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> { | ||||
|         &self.inner | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn prepare_query_distances<'a, 'tag, 'txn>( | ||||
|     documents: &mut [RawDocument<'a, 'tag>], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     automatons: &[QueryWordAutomaton], | ||||
|     postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
| ) { | ||||
|     for document in documents { | ||||
|         if !document.processed_distances.is_empty() { continue } | ||||
|  | ||||
|         let mut processed = Vec::new(); | ||||
|         for m in document.raw_matches.iter() { | ||||
|             if postings_lists[m.postings_list].is_empty() { continue } | ||||
|  | ||||
|             let range = query_enhancer.replacement(m.query_index as u32); | ||||
|             let new_len = cmp::max(range.end as usize, processed.len()); | ||||
|             processed.resize(new_len, None); | ||||
|  | ||||
|             for index in range { | ||||
|                 let index = index as usize; | ||||
|                 processed[index] = match processed[index] { | ||||
|                     Some(distance) if distance > m.distance => Some(m.distance), | ||||
|                     Some(distance) => Some(distance), | ||||
|                     None => Some(m.distance), | ||||
|                 }; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         document.processed_distances = processed; | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn prepare_raw_matches<'a, 'tag, 'txn>( | ||||
|     documents: &mut [RawDocument<'a, 'tag>], | ||||
|     postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     automatons: &[QueryWordAutomaton], | ||||
| ) { | ||||
|     for document in documents { | ||||
|         if !document.processed_matches.is_empty() { continue } | ||||
|  | ||||
|         let mut processed = Vec::new(); | ||||
|         for m in document.raw_matches.iter() { | ||||
|             let postings_list = &postings_lists[m.postings_list]; | ||||
|             processed.reserve(postings_list.len()); | ||||
|             for di in postings_list.as_ref() { | ||||
|                 let simple_match = SimpleMatch { | ||||
|                     query_index: m.query_index, | ||||
|                     distance: m.distance, | ||||
|                     attribute: di.attribute, | ||||
|                     word_index: di.word_index, | ||||
|                     is_exact: m.is_exact, | ||||
|                 }; | ||||
|                 processed.push(simple_match); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); | ||||
|         document.processed_matches = processed.into_vec(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn multiword_rewrite_matches( | ||||
|     matches: &mut [SimpleMatch], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     automatons: &[QueryWordAutomaton], | ||||
| ) -> SetBuf<SimpleMatch> | ||||
| { | ||||
|     matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); | ||||
|  | ||||
|     let mut padded_matches = Vec::with_capacity(matches.len()); | ||||
|  | ||||
|     // let before_padding = Instant::now(); | ||||
|     // for each attribute of each document | ||||
|     for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { | ||||
|         // padding will only be applied | ||||
|         // to word indices in the same attribute | ||||
|         let mut padding = 0; | ||||
|         let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); | ||||
|  | ||||
|         // for each match at the same position | ||||
|         // in this document attribute | ||||
|         while let Some(same_word_index) = iter.next() { | ||||
|             // find the biggest padding | ||||
|             let mut biggest = 0; | ||||
|             for match_ in same_word_index { | ||||
|                 let mut replacement = query_enhancer.replacement(match_.query_index as u32); | ||||
|                 let replacement_len = replacement.len(); | ||||
|                 let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); | ||||
|  | ||||
|                 if let Some(query_index) = replacement.next() { | ||||
|                     let word_index = match_.word_index + padding as u16; | ||||
|                     let query_index = query_index as u16; | ||||
|                     let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|                     padded_matches.push(match_); | ||||
|                 } | ||||
|  | ||||
|                 let mut found = false; | ||||
|  | ||||
|                 // look ahead and if there already is a match | ||||
|                 // corresponding to this padding word, abort the padding | ||||
|                 'padding: for (x, next_group) in nexts.enumerate() { | ||||
|                     for (i, query_index) in replacement.clone().enumerate().skip(x) { | ||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                         let query_index = query_index as u16; | ||||
|                         let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|  | ||||
|                         for nmatch_ in next_group { | ||||
|                             let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); | ||||
|                             let query_index = rep.next().unwrap() as u16; | ||||
|                             if query_index == padmatch.query_index { | ||||
|                                 if !found { | ||||
|                                     // if we find a corresponding padding for the | ||||
|                                     // first time we must push preceding paddings | ||||
|                                     for (i, query_index) in replacement.clone().enumerate().take(i) | ||||
|                                     { | ||||
|                                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                                         let query_index = query_index as u16; | ||||
|                                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|                                         padded_matches.push(match_); | ||||
|                                         biggest = biggest.max(i + 1); | ||||
|                                     } | ||||
|                                 } | ||||
|  | ||||
|                                 padded_matches.push(padmatch); | ||||
|                                 found = true; | ||||
|                                 continue 'padding; | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     // if we do not find a corresponding padding in the | ||||
|                     // next groups so stop here and pad what was found | ||||
|                     break; | ||||
|                 } | ||||
|  | ||||
|                 if !found { | ||||
|                     // if no padding was found in the following matches | ||||
|                     // we must insert the entire padding | ||||
|                     for (i, query_index) in replacement.enumerate() { | ||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                         let query_index = query_index as u16; | ||||
|                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|                         padded_matches.push(match_); | ||||
|                     } | ||||
|  | ||||
|                     biggest = biggest.max(replacement_len - 1); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             padding += biggest; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // debug!("padding matches took {:.02?}", before_padding.elapsed()); | ||||
|  | ||||
|     // With this check we can see that the loop above takes something | ||||
|     // like 43% of the search time even when no rewrite is needed. | ||||
|     // assert_eq!(before_matches, padded_matches); | ||||
|  | ||||
|     SetBuf::from_dirty(padded_matches) | ||||
| } | ||||
|   | ||||
| @@ -1,31 +0,0 @@ | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
| use slice_group_by::GroupBy; | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| #[inline] | ||||
| fn number_of_query_words(query_index: &[u32]) -> usize { | ||||
|     query_index.linear_group().count() | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct NumberOfWords; | ||||
|  | ||||
| impl Criterion for NumberOfWords { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             number_of_query_words(query_index) | ||||
|         }; | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             number_of_query_words(query_index) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "NumberOfWords" | ||||
|     } | ||||
| } | ||||
							
								
								
									
										79
									
								
								meilisearch-core/src/criterion/proximity.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								meilisearch-core/src/criterion/proximity.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,79 @@ | ||||
| use std::cmp::{self, Ordering}; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| use super::{Criterion, prepare_raw_matches}; | ||||
|  | ||||
| pub struct Proximity; | ||||
|  | ||||
| impl Criterion for Proximity { | ||||
|     fn name(&self) -> &str { "proximity" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         const MAX_DISTANCE: u16 = 8; | ||||
|  | ||||
|         fn index_proximity(lhs: u16, rhs: u16) -> u16 { | ||||
|             if lhs < rhs { | ||||
|                 cmp::min(rhs - lhs, MAX_DISTANCE) | ||||
|             } else { | ||||
|                 cmp::min(lhs - rhs, MAX_DISTANCE) + 1 | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { | ||||
|             if lhs.attribute != rhs.attribute { MAX_DISTANCE } | ||||
|             else { index_proximity(lhs.word_index, rhs.word_index) } | ||||
|         } | ||||
|  | ||||
|         fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { | ||||
|             let mut min_prox = u16::max_value(); | ||||
|             for a in lhs { | ||||
|                 for b in rhs { | ||||
|                     let prox = attribute_proximity(*a, *b); | ||||
|                     min_prox = cmp::min(min_prox, prox); | ||||
|                 } | ||||
|             } | ||||
|             min_prox | ||||
|         } | ||||
|  | ||||
|         fn matches_proximity(matches: &[SimpleMatch],) -> u16 { | ||||
|             let mut proximity = 0; | ||||
|             let mut iter = matches.linear_group_by_key(|m| m.query_index); | ||||
|  | ||||
|             // iterate over groups by windows of size 2 | ||||
|             let mut last = iter.next(); | ||||
|             while let (Some(lhs), Some(rhs)) = (last, iter.next()) { | ||||
|                 proximity += min_proximity(lhs, rhs); | ||||
|                 last = Some(rhs); | ||||
|             } | ||||
|  | ||||
|             proximity | ||||
|         } | ||||
|  | ||||
|         let lhs = matches_proximity(&lhs.processed_matches); | ||||
|         let rhs = matches_proximity(&rhs.processed_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
| @@ -2,9 +2,13 @@ use std::cmp::Ordering; | ||||
| use std::error::Error; | ||||
| use std::fmt; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use meilisearch_schema::{Schema, SchemaAttr}; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::{RankedMap, RawDocument}; | ||||
| use meilisearch_schema::{Schema, SchemaAttr}; | ||||
|  | ||||
| /// A helper struct that permits sorting documents by | ||||
| /// some of their stored attributes. | ||||
| @@ -28,11 +32,11 @@ use meilisearch_schema::{Schema, SchemaAttr}; | ||||
| /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; | ||||
| /// | ||||
| /// let builder = CriteriaBuilder::with_capacity(8) | ||||
| ///        .add(SumOfTypos) | ||||
| ///        .add(NumberOfWords) | ||||
| ///        .add(WordsProximity) | ||||
| ///        .add(SumOfWordsAttribute) | ||||
| ///        .add(SumOfWordsPosition) | ||||
| ///        .add(Typo) | ||||
| ///        .add(Words) | ||||
| ///        .add(Proximity) | ||||
| ///        .add(Attribute) | ||||
| ///        .add(WordsPosition) | ||||
| ///        .add(Exact) | ||||
| ///        .add(custom_ranking) | ||||
| ///        .add(DocumentId); | ||||
| @@ -86,8 +90,28 @@ impl<'a> SortByAttr<'a> { | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> Criterion for SortByAttr<'a> { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
| impl Criterion for SortByAttr<'_> { | ||||
|     fn name(&self) -> &str { | ||||
|         "sort by attribute" | ||||
|     } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         // ... | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         let lhs = self.ranked_map.get(lhs.id, self.attr); | ||||
|         let rhs = self.ranked_map.get(rhs.id, self.attr); | ||||
|  | ||||
| @@ -105,10 +129,6 @@ impl<'a> Criterion for SortByAttr<'a> { | ||||
|             (None, None) => Ordering::Equal, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "SortByAttr" | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | ||||
|   | ||||
| @@ -1,116 +0,0 @@ | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
// This is not a real base-10 logarithm: it maps a number of typos `n`
// to a truncated value of log10(n + 1).
// It is safe to panic on input numbers higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
    // Entry `i` holds the value returned for `i` typos: log(1) .. log(4).
    const LOG10_OF_SUCCESSOR: [f32; 4] = [0.0, 0.30102, 0.47712, 0.60205];

    match LOG10_OF_SUCCESSOR.get(usize::from(n)) {
        Some(&value) => value,
        None => panic!("invalid number"),
    }
}
|  | ||||
| #[inline] | ||||
| fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { | ||||
|     let mut number_words: usize = 0; | ||||
|     let mut sum_typos = 0.0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         sum_typos += custom_log10(distance[index]); | ||||
|         number_words += 1; | ||||
|         index += group.len(); | ||||
|     } | ||||
|  | ||||
|     (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct SumOfTypos; | ||||
|  | ||||
| impl Criterion for SumOfTypos { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let distance = lhs.distance(); | ||||
|             sum_matches_typos(query_index, distance) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let distance = rhs.distance(); | ||||
|             sum_matches_typos(query_index, distance) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "SumOfTypos" | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordering the criterion gives to two documents described by their
    /// `query_index` and `distance` columns.
    fn rank(
        query_index0: &[u32],
        distance0: &[u8],
        query_index1: &[u32],
        distance1: &[u8],
    ) -> Ordering {
        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        doc0.cmp(&doc1).reverse()
    }

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let ordering = rank(&[0, 1], &[0, 0], &[0, 1], &[1, 0]);
        assert_eq!(ordering, Ordering::Less);
    }

    // typing: "bouton manchette"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let ordering = rank(&[0, 1], &[0, 0], &[0], &[0]);
        assert_eq!(ordering, Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let ordering = rank(&[0, 1], &[0, 1], &[0], &[0]);
        assert_eq!(ordering, Ordering::Less);
    }
}
| @@ -1,64 +0,0 @@ | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
| use slice_group_by::GroupBy; | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| #[inline] | ||||
| fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { | ||||
|     let mut sum_attributes = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         sum_attributes += attribute[index] as usize; | ||||
|         index += group.len(); | ||||
|     } | ||||
|  | ||||
|     sum_attributes | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct SumOfWordsAttribute; | ||||
|  | ||||
| impl Criterion for SumOfWordsAttribute { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let attribute = lhs.attribute(); | ||||
|             sum_matches_attributes(query_index, attribute) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let attribute = rhs.attribute(); | ||||
|             sum_matches_attributes(query_index, attribute) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "SumOfWordsAttribute" | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;

    // typing: "soulier"
    //
    // doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
    // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
    #[test]
    fn title_vs_description() {
        // One query word for each document, found in attribute 0 and
        // attribute 1 respectively.
        let in_title = sum_matches_attributes(&[0], &[0]);
        let in_description = sum_matches_attributes(&[0], &[1]);

        assert_eq!(in_title.cmp(&in_description), Ordering::Less);
    }
}
| @@ -1,64 +0,0 @@ | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
| use slice_group_by::GroupBy; | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| #[inline] | ||||
| fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { | ||||
|     let mut sum_word_index = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         sum_word_index += word_index[index] as usize; | ||||
|         index += group.len(); | ||||
|     } | ||||
|  | ||||
|     sum_word_index | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct SumOfWordsPosition; | ||||
|  | ||||
| impl Criterion for SumOfWordsPosition { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let word_index = lhs.word_index(); | ||||
|             sum_matches_attribute_index(query_index, word_index) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let word_index = rhs.word_index(); | ||||
|             sum_matches_attribute_index(query_index, word_index) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "SumOfWordsPosition" | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;

    // typing: "soulier"
    //
    // doc0: "Soulier bleu"
    // doc1: "Botte rouge et soulier noir"
    #[test]
    fn easy_case() {
        // One query word for each document, found at word index 0 and
        // word index 3 respectively.
        let at_start = sum_matches_attribute_index(&[0], &[0]);
        let further = sum_matches_attribute_index(&[0], &[3]);

        assert_eq!(at_start.cmp(&further), Ordering::Less);
    }
}
							
								
								
									
										67
									
								
								meilisearch-core/src/criterion/typo.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								meilisearch-core/src/criterion/typo.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| use super::{Criterion, prepare_query_distances}; | ||||
|  | ||||
| pub struct Typo; | ||||
|  | ||||
| impl Criterion for Typo { | ||||
|     fn name(&self) -> &str { "typo" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||
|     } | ||||
|  | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &SmallArena<PostingsListView>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         // This function is a wrong logarithmic 10 function. | ||||
|         // It is safe to panic on input number higher than 3, | ||||
|         // the number of typos is never bigger than that. | ||||
|         #[inline] | ||||
|         fn custom_log10(n: u8) -> f32 { | ||||
|             match n { | ||||
|                 0 => 0.0,     // log(1) | ||||
|                 1 => 0.30102, // log(2) | ||||
|                 2 => 0.47712, // log(3) | ||||
|                 3 => 0.60205, // log(4) | ||||
|                 _ => panic!("invalid number"), | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         #[inline] | ||||
|         fn compute_typos(distances: &[Option<u8>]) -> usize { | ||||
|             let mut number_words: usize = 0; | ||||
|             let mut sum_typos = 0.0; | ||||
|  | ||||
|             for distance in distances { | ||||
|                 if let Some(distance) = distance { | ||||
|                     sum_typos += custom_log10(*distance); | ||||
|                     number_words += 1; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize | ||||
|         } | ||||
|  | ||||
|         let lhs = compute_typos(&lhs.processed_distances); | ||||
|         let rhs = compute_typos(&rhs.processed_distances); | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
							
								
								
									
										43
									
								
								meilisearch-core/src/criterion/words.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								meilisearch-core/src/criterion/words.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| use super::{Criterion, prepare_query_distances}; | ||||
|  | ||||
| pub struct Words; | ||||
|  | ||||
| impl Criterion for Words { | ||||
|     fn name(&self) -> &str { "words" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||
|     } | ||||
|  | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &SmallArena<PostingsListView>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn number_of_query_words(distances: &[Option<u8>]) -> usize { | ||||
|             distances.iter().cloned().filter(Option::is_some).count() | ||||
|         } | ||||
|  | ||||
|         let lhs = number_of_query_words(&lhs.processed_distances); | ||||
|         let rhs = number_of_query_words(&rhs.processed_distances); | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
							
								
								
									
										48
									
								
								meilisearch-core/src/criterion/words_position.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								meilisearch-core/src/criterion/words_position.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::automaton::QueryEnhancer; | ||||
| use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| use super::{Criterion, prepare_raw_matches}; | ||||
|  | ||||
| pub struct WordsPosition; | ||||
|  | ||||
| impl Criterion for WordsPosition { | ||||
|     fn name(&self) -> &str { "words position" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn sum_words_position(matches: &[SimpleMatch]) -> usize { | ||||
|             let mut sum_words_position = 0; | ||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||
|                 sum_words_position += group[0].word_index as usize; | ||||
|             } | ||||
|             sum_words_position | ||||
|         } | ||||
|  | ||||
|         let lhs = sum_words_position(&lhs.processed_matches); | ||||
|         let rhs = sum_words_position(&rhs.processed_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
| @@ -1,164 +0,0 @@ | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
| use slice_group_by::GroupBy; | ||||
| use std::cmp::{self, Ordering}; | ||||
|  | ||||
/// Cap applied to the distance between two word indexes; also the proximity
/// given to two matches located in different attributes.
const MAX_DISTANCE: u16 = 8;

/// Turns a pair of references into a pair of owned clones.
#[inline]
fn clone_tuple<T, U>(pair: (&T, &U)) -> (T, U)
where
    T: Clone,
    U: Clone,
{
    let (first, second) = pair;
    (first.clone(), second.clone())
}
|  | ||||
| fn index_proximity(lhs: u16, rhs: u16) -> u16 { | ||||
|     if lhs < rhs { | ||||
|         cmp::min(rhs - lhs, MAX_DISTANCE) | ||||
|     } else { | ||||
|         cmp::min(lhs - rhs, MAX_DISTANCE) + 1 | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { | ||||
|     if lattr != rattr { | ||||
|         return MAX_DISTANCE; | ||||
|     } | ||||
|     index_proximity(lwi, rwi) | ||||
| } | ||||
|  | ||||
| fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { | ||||
|     let mut min_prox = u16::max_value(); | ||||
|  | ||||
|     for a in lattr.iter().zip(lwi) { | ||||
|         for b in rattr.iter().zip(rwi) { | ||||
|             let a = clone_tuple(a); | ||||
|             let b = clone_tuple(b); | ||||
|             min_prox = cmp::min(min_prox, attribute_proximity(a, b)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     min_prox | ||||
| } | ||||
|  | ||||
| fn matches_proximity( | ||||
|     query_index: &[u32], | ||||
|     distance: &[u8], | ||||
|     attribute: &[u16], | ||||
|     word_index: &[u16], | ||||
| ) -> u16 { | ||||
|     let mut query_index_groups = query_index.linear_group(); | ||||
|     let mut proximity = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     let get_attr_wi = |index: usize, group_len: usize| { | ||||
|         // retrieve the first distance group (with the lowest values) | ||||
|         let len = distance[index..index + group_len] | ||||
|             .linear_group() | ||||
|             .next() | ||||
|             .unwrap() | ||||
|             .len(); | ||||
|  | ||||
|         let rattr = &attribute[index..index + len]; | ||||
|         let rwi = &word_index[index..index + len]; | ||||
|  | ||||
|         (rattr, rwi) | ||||
|     }; | ||||
|  | ||||
|     let mut last = query_index_groups.next().map(|group| { | ||||
|         let attr_wi = get_attr_wi(index, group.len()); | ||||
|         index += group.len(); | ||||
|         attr_wi | ||||
|     }); | ||||
|  | ||||
|     // iter by windows of size 2 | ||||
|     while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { | ||||
|         let attr_wi = get_attr_wi(index, rhs.len()); | ||||
|         proximity += min_proximity(lhs, attr_wi); | ||||
|         last = Some(attr_wi); | ||||
|         index += rhs.len(); | ||||
|     } | ||||
|  | ||||
|     proximity | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct WordsProximity; | ||||
|  | ||||
| impl Criterion for WordsProximity { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let distance = lhs.distance(); | ||||
|             let attribute = lhs.attribute(); | ||||
|             let word_index = lhs.word_index(); | ||||
|             matches_proximity(query_index, distance, attribute, word_index) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let distance = rhs.distance(); | ||||
|             let attribute = rhs.attribute(); | ||||
|             let word_index = rhs.word_index(); | ||||
|             matches_proximity(query_index, distance, attribute, word_index) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         "WordsProximity" | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }
        let query_index: &[u32] = &[0, 1, 2, 2, 3];
        let distance: &[u8] = &[0, 0, 0, 0, 0];
        let attribute: &[u16] = &[0, 1, 1, 2, 3];
        let word_index: &[u16] = &[0, 0, 1, 0, 1];

        //   soup -> of = 8
        // + of -> the  = 1
        // + the -> day = 8 (not 1)
        let expected = 17;

        let proximity = matches_proximity(query_index, distance, attribute, word_index);
        assert_eq!(proximity, expected);
    }

    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }
        let query_index: &[u32] = &[0, 0, 1, 2, 3, 3];
        let distance: &[u8] = &[0, 0, 0, 0, 0, 0];
        let attribute: &[u16] = &[0, 1, 1, 1, 0, 1];
        let word_index: &[u16] = &[0, 0, 1, 2, 1, 3];

        //   soup -> of = 1
        // + of -> the  = 1
        // + the -> day = 1
        let expected = 3;

        let proximity = matches_proximity(query_index, distance, attribute, word_index);
        assert_eq!(proximity, expected);
    }
}
| @@ -1,514 +0,0 @@ | ||||
| use std::cmp::{self, Ordering, Reverse}; | ||||
| use std::borrow::Cow; | ||||
| use std::sync::atomic::{self, AtomicUsize}; | ||||
|  | ||||
| use slice_group_by::{GroupBy, GroupByMut}; | ||||
| use compact_arena::SmallArena; | ||||
| use sdset::{Set, SetBuf}; | ||||
| use log::debug; | ||||
|  | ||||
| use crate::{DocIndex, DocumentId}; | ||||
| use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton}; | ||||
| use crate::automaton::QueryEnhancer; | ||||
|  | ||||
| type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; | ||||
|  | ||||
| pub trait Criterion { | ||||
|     fn name(&self) -> &str; | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ); | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||
|     ) -> Ordering; | ||||
|  | ||||
|     #[inline] | ||||
|     fn eq<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||
|     ) -> bool | ||||
|     { | ||||
|         self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn prepare_query_distances<'a, 'tag, 'txn>( | ||||
|     documents: &mut [RawDocument<'a, 'tag>], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     automatons: &[QueryWordAutomaton], | ||||
|     postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||
| ) { | ||||
|     for document in documents { | ||||
|         if !document.processed_distances.is_empty() { continue } | ||||
|  | ||||
|         let mut processed = Vec::new(); | ||||
|         for m in document.raw_matches.iter() { | ||||
|             if postings_lists[m.postings_list].is_empty() { continue } | ||||
|  | ||||
|             let range = query_enhancer.replacement(m.query_index as u32); | ||||
|             let new_len = cmp::max(range.end as usize, processed.len()); | ||||
|             processed.resize(new_len, None); | ||||
|  | ||||
|             for index in range { | ||||
|                 let index = index as usize; | ||||
|                 processed[index] = match processed[index] { | ||||
|                     Some(distance) if distance > m.distance => Some(m.distance), | ||||
|                     Some(distance) => Some(distance), | ||||
|                     None => Some(m.distance), | ||||
|                 }; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         document.processed_distances = processed; | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Typo; | ||||
|  | ||||
| impl Criterion for Typo { | ||||
|     fn name(&self) -> &str { "typo" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||
|     } | ||||
|  | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &PostingsListsArena, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         // This function is a wrong logarithmic 10 function. | ||||
|         // It is safe to panic on input number higher than 3, | ||||
|         // the number of typos is never bigger than that. | ||||
|         #[inline] | ||||
|         fn custom_log10(n: u8) -> f32 { | ||||
|             match n { | ||||
|                 0 => 0.0,     // log(1) | ||||
|                 1 => 0.30102, // log(2) | ||||
|                 2 => 0.47712, // log(3) | ||||
|                 3 => 0.60205, // log(4) | ||||
|                 _ => panic!("invalid number"), | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         #[inline] | ||||
|         fn compute_typos(distances: &[Option<u8>]) -> usize { | ||||
|             let mut number_words: usize = 0; | ||||
|             let mut sum_typos = 0.0; | ||||
|  | ||||
|             for distance in distances { | ||||
|                 if let Some(distance) = distance { | ||||
|                     sum_typos += custom_log10(*distance); | ||||
|                     number_words += 1; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize | ||||
|         } | ||||
|  | ||||
|         let lhs = compute_typos(&lhs.processed_distances); | ||||
|         let rhs = compute_typos(&rhs.processed_distances); | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Words; | ||||
|  | ||||
| impl Criterion for Words { | ||||
|     fn name(&self) -> &str { "words" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||
|     } | ||||
|  | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &PostingsListsArena, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn number_of_query_words(distances: &[Option<u8>]) -> usize { | ||||
|             distances.iter().cloned().filter(Option::is_some).count() | ||||
|         } | ||||
|  | ||||
|         let lhs = number_of_query_words(&lhs.processed_distances); | ||||
|         let rhs = number_of_query_words(&rhs.processed_distances); | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn prepare_raw_matches<'a, 'tag, 'txn>( | ||||
|     documents: &mut [RawDocument<'a, 'tag>], | ||||
|     postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     automatons: &[QueryWordAutomaton], | ||||
| ) { | ||||
|     for document in documents { | ||||
|         if !document.processed_matches.is_empty() { continue } | ||||
|  | ||||
|         let mut processed = Vec::new(); | ||||
|         for m in document.raw_matches.iter() { | ||||
|             let postings_list = &postings_lists[m.postings_list]; | ||||
|             processed.reserve(postings_list.len()); | ||||
|             for di in postings_list.as_ref() { | ||||
|                 let simple_match = SimpleMatch { | ||||
|                     query_index: m.query_index, | ||||
|                     distance: m.distance, | ||||
|                     attribute: di.attribute, | ||||
|                     word_index: di.word_index, | ||||
|                     is_exact: m.is_exact, | ||||
|                 }; | ||||
|                 processed.push(simple_match); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); | ||||
|         document.processed_matches = processed.into_vec(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Proximity; | ||||
|  | ||||
| impl Criterion for Proximity { | ||||
|     fn name(&self) -> &str { "proximity" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         const MAX_DISTANCE: u16 = 8; | ||||
|  | ||||
|         fn index_proximity(lhs: u16, rhs: u16) -> u16 { | ||||
|             if lhs < rhs { | ||||
|                 cmp::min(rhs - lhs, MAX_DISTANCE) | ||||
|             } else { | ||||
|                 cmp::min(lhs - rhs, MAX_DISTANCE) + 1 | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { | ||||
|             if lhs.attribute != rhs.attribute { MAX_DISTANCE } | ||||
|             else { index_proximity(lhs.word_index, rhs.word_index) } | ||||
|         } | ||||
|  | ||||
|         fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { | ||||
|             let mut min_prox = u16::max_value(); | ||||
|             for a in lhs { | ||||
|                 for b in rhs { | ||||
|                     let prox = attribute_proximity(*a, *b); | ||||
|                     min_prox = cmp::min(min_prox, prox); | ||||
|                 } | ||||
|             } | ||||
|             min_prox | ||||
|         } | ||||
|  | ||||
|         fn matches_proximity(matches: &[SimpleMatch],) -> u16 { | ||||
|             let mut proximity = 0; | ||||
|             let mut iter = matches.linear_group_by_key(|m| m.query_index); | ||||
|  | ||||
|             // iterate over groups by windows of size 2 | ||||
|             let mut last = iter.next(); | ||||
|             while let (Some(lhs), Some(rhs)) = (last, iter.next()) { | ||||
|                 proximity += min_proximity(lhs, rhs); | ||||
|                 last = Some(rhs); | ||||
|             } | ||||
|  | ||||
|             proximity | ||||
|         } | ||||
|  | ||||
|         let lhs = matches_proximity(&lhs.processed_matches); | ||||
|         let rhs = matches_proximity(&rhs.processed_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Attribute; | ||||
|  | ||||
| impl Criterion for Attribute { | ||||
|     fn name(&self) -> &str { "attribute" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn best_attribute(matches: &[SimpleMatch]) -> u16 { | ||||
|             let mut best_attribute = u16::max_value(); | ||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||
|                 best_attribute = cmp::min(best_attribute, group[0].attribute); | ||||
|             } | ||||
|             best_attribute | ||||
|         } | ||||
|  | ||||
|         let lhs = best_attribute(&lhs.processed_matches); | ||||
|         let rhs = best_attribute(&rhs.processed_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct WordsPosition; | ||||
|  | ||||
| impl Criterion for WordsPosition { | ||||
|     fn name(&self) -> &str { "words position" } | ||||
|  | ||||
|     fn prepare<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument<'a, 'tag>], | ||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||
|     } | ||||
|  | ||||
|     fn evaluate<'a, 'tag, 'txn>( | ||||
|         &self, | ||||
|         lhs: &RawDocument<'a, 'tag>, | ||||
|         rhs: &RawDocument<'a, 'tag>, | ||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn sum_words_position(matches: &[SimpleMatch]) -> usize { | ||||
|             let mut sum_words_position = 0; | ||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||
|                 sum_words_position += group[0].word_index as usize; | ||||
|             } | ||||
|             sum_words_position | ||||
|         } | ||||
|  | ||||
|         let lhs = sum_words_position(&lhs.processed_matches); | ||||
|         let rhs = sum_words_position(&rhs.processed_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Exact; | ||||
|  | ||||
| impl Criterion for Exact { | ||||
|     fn name(&self) -> &str { "exact" } | ||||
|  | ||||
|     fn prepare( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument], | ||||
|         postings_lists: &mut PostingsListsArena, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         for document in documents { | ||||
|             document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &PostingsListsArena, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         #[inline] | ||||
|         fn sum_exact_query_words(matches: &[BareMatch]) -> usize { | ||||
|             let mut sum_exact_query_words = 0; | ||||
|  | ||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||
|                 sum_exact_query_words += group[0].is_exact as usize; | ||||
|             } | ||||
|  | ||||
|             sum_exact_query_words | ||||
|         } | ||||
|  | ||||
|         let lhs = sum_exact_query_words(&lhs.raw_matches); | ||||
|         let rhs = sum_exact_query_words(&rhs.raw_matches); | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct StableDocId; | ||||
|  | ||||
| impl Criterion for StableDocId { | ||||
|     fn name(&self) -> &str { "stable document id" } | ||||
|  | ||||
|     fn prepare( | ||||
|         &self, | ||||
|         documents: &mut [RawDocument], | ||||
|         postings_lists: &mut PostingsListsArena, | ||||
|         query_enhancer: &QueryEnhancer, | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|     ) { | ||||
|         // ... | ||||
|     } | ||||
|  | ||||
|     fn evaluate( | ||||
|         &self, | ||||
|         lhs: &RawDocument, | ||||
|         rhs: &RawDocument, | ||||
|         postings_lists: &PostingsListsArena, | ||||
|     ) -> Ordering | ||||
|     { | ||||
|         let lhs = &lhs.raw_matches[0].document_id; | ||||
|         let rhs = &rhs.raw_matches[0].document_id; | ||||
|  | ||||
|         lhs.cmp(rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn multiword_rewrite_matches( | ||||
|     matches: &mut [SimpleMatch], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     automatons: &[QueryWordAutomaton], | ||||
| ) -> SetBuf<SimpleMatch> | ||||
| { | ||||
|     matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); | ||||
|  | ||||
|     let mut padded_matches = Vec::with_capacity(matches.len()); | ||||
|  | ||||
|     // let before_padding = Instant::now(); | ||||
|     // for each attribute of each document | ||||
|     for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { | ||||
|         // padding will only be applied | ||||
|         // to word indices in the same attribute | ||||
|         let mut padding = 0; | ||||
|         let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); | ||||
|  | ||||
|         // for each match at the same position | ||||
|         // in this document attribute | ||||
|         while let Some(same_word_index) = iter.next() { | ||||
|             // find the biggest padding | ||||
|             let mut biggest = 0; | ||||
|             for match_ in same_word_index { | ||||
|                 let mut replacement = query_enhancer.replacement(match_.query_index as u32); | ||||
|                 let replacement_len = replacement.len(); | ||||
|                 let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); | ||||
|  | ||||
|                 if let Some(query_index) = replacement.next() { | ||||
|                     let word_index = match_.word_index + padding as u16; | ||||
|                     let query_index = query_index as u16; | ||||
|                     let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|                     padded_matches.push(match_); | ||||
|                 } | ||||
|  | ||||
|                 let mut found = false; | ||||
|  | ||||
|                 // look ahead and if there already is a match | ||||
|                 // corresponding to this padding word, abort the padding | ||||
|                 'padding: for (x, next_group) in nexts.enumerate() { | ||||
|                     for (i, query_index) in replacement.clone().enumerate().skip(x) { | ||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                         let query_index = query_index as u16; | ||||
|                         let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|  | ||||
|                         for nmatch_ in next_group { | ||||
|                             let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); | ||||
|                             let query_index = rep.next().unwrap() as u16; | ||||
|                             if query_index == padmatch.query_index { | ||||
|                                 if !found { | ||||
|                                     // if we find a corresponding padding for the | ||||
|                                     // first time we must push preceding paddings | ||||
|                                     for (i, query_index) in replacement.clone().enumerate().take(i) | ||||
|                                     { | ||||
|                                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                                         let query_index = query_index as u16; | ||||
|                                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|                                         padded_matches.push(match_); | ||||
|                                         biggest = biggest.max(i + 1); | ||||
|                                     } | ||||
|                                 } | ||||
|  | ||||
|                                 padded_matches.push(padmatch); | ||||
|                                 found = true; | ||||
|                                 continue 'padding; | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     // if we do not find a corresponding padding in the | ||||
|                     // next groups so stop here and pad what was found | ||||
|                     break; | ||||
|                 } | ||||
|  | ||||
|                 if !found { | ||||
|                     // if no padding was found in the following matches | ||||
|                     // we must insert the entire padding | ||||
|                     for (i, query_index) in replacement.enumerate() { | ||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                         let query_index = query_index as u16; | ||||
|                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||
|                         padded_matches.push(match_); | ||||
|                     } | ||||
|  | ||||
|                     biggest = biggest.max(replacement_len - 1); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             padding += biggest; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // debug!("padding matches took {:.02?}", before_padding.elapsed()); | ||||
|  | ||||
|     // With this check we can see that the loop above takes something | ||||
|     // like 43% of the search time even when no rewrite is needed. | ||||
|     // assert_eq!(before_matches, padded_matches); | ||||
|  | ||||
|     SetBuf::from_dirty(padded_matches) | ||||
| } | ||||
| @@ -20,7 +20,6 @@ mod update; | ||||
|  | ||||
| // TODO replace | ||||
| mod bucket_sort; | ||||
| mod criterion2; | ||||
|  | ||||
| pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; | ||||
| pub use self::error::{Error, MResult}; | ||||
| @@ -31,62 +30,13 @@ pub use self::store::Index; | ||||
| pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; | ||||
| pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; | ||||
|  | ||||
/// Flat description of one match of a query word inside a document.
///
/// The field order is significant: the derived ordering compares the
/// fields from top to bottom.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
#[doc(hidden)]
pub struct TmpMatch {
    /// Index of the query word this match belongs to.
    pub query_index: u32,
    /// Distance reported for the matched word.
    pub distance: u8,
    /// Attribute in which the word was found.
    pub attribute: u16,
    /// Index of the word inside the attribute.
    pub word_index: u16,
    /// Whether the match is flagged as exact.
    pub is_exact: bool,
}
|  | ||||
| #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||
| pub struct Document { | ||||
|     pub id: DocumentId, | ||||
|     pub highlights: Vec<Highlight>, | ||||
|  | ||||
|     #[cfg(test)] | ||||
|     pub matches: Vec<TmpMatch>, | ||||
| } | ||||
|  | ||||
| impl Document { | ||||
|     #[cfg(not(test))] | ||||
|     fn from_raw(raw: RawDocument) -> Document { | ||||
|         Document { | ||||
|             id: raw.id, | ||||
|             highlights: raw.highlights, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[cfg(test)] | ||||
|     fn from_raw(raw: RawDocument) -> Document { | ||||
|         let len = raw.query_index().len(); | ||||
|         let mut matches = Vec::with_capacity(len); | ||||
|  | ||||
|         let query_index = raw.query_index(); | ||||
|         let distance = raw.distance(); | ||||
|         let attribute = raw.attribute(); | ||||
|         let word_index = raw.word_index(); | ||||
|         let is_exact = raw.is_exact(); | ||||
|  | ||||
|         for i in 0..len { | ||||
|             let match_ = TmpMatch { | ||||
|                 query_index: query_index[i], | ||||
|                 distance: distance[i], | ||||
|                 attribute: attribute[i], | ||||
|                 word_index: word_index[i], | ||||
|                 is_exact: is_exact[i], | ||||
|             }; | ||||
|             matches.push(match_); | ||||
|         } | ||||
|  | ||||
|         Document { | ||||
|             id: raw.id, | ||||
|             matches, | ||||
|             highlights: raw.highlights, | ||||
|         } | ||||
|     } | ||||
|     // #[cfg(test)] | ||||
|     // pub matches: Vec<TmpMatch>, | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
|   | ||||
| @@ -1,21 +1,8 @@ | ||||
| use hashbrown::HashMap; | ||||
| use std::convert::TryFrom; | ||||
| use std::ops::Range; | ||||
| use std::rc::Rc; | ||||
| use std::time::{Duration, Instant}; | ||||
| use std::{cmp, mem}; | ||||
|  | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use log::debug; | ||||
| use sdset::SetBuf; | ||||
| use slice_group_by::{GroupBy, GroupByMut}; | ||||
| use std::time::Duration; | ||||
|  | ||||
| use crate::{bucket_sort::bucket_sort, database::MainT}; | ||||
| use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer}; | ||||
| use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; | ||||
| use crate::levenshtein::prefix_damerau_levenshtein; | ||||
| use crate::raw_document::{raw_documents_from, RawDocument}; | ||||
| use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; | ||||
| use crate::{criterion::Criteria, Document, DocumentId}; | ||||
| use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; | ||||
|  | ||||
| pub struct QueryBuilder<'c, 'f, 'd> { | ||||
| @@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> { | ||||
|     synonyms_store: store::Synonyms, | ||||
| } | ||||
|  | ||||
| fn multiword_rewrite_matches( | ||||
|     mut matches: Vec<(DocumentId, TmpMatch)>, | ||||
|     query_enhancer: &QueryEnhancer, | ||||
| ) -> SetBuf<(DocumentId, TmpMatch)> { | ||||
|     let mut padded_matches = Vec::with_capacity(matches.len()); | ||||
|  | ||||
|     let before_sort = Instant::now(); | ||||
|     // we sort the matches by word index to make them rewritable | ||||
|     matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); | ||||
|     debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); | ||||
|  | ||||
|     let before_padding = Instant::now(); | ||||
|     // for each attribute of each document | ||||
|     for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { | ||||
|         // padding will only be applied | ||||
|         // to word indices in the same attribute | ||||
|         let mut padding = 0; | ||||
|         let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); | ||||
|  | ||||
|         // for each match at the same position | ||||
|         // in this document attribute | ||||
|         while let Some(same_word_index) = iter.next() { | ||||
|             // find the biggest padding | ||||
|             let mut biggest = 0; | ||||
|             for (id, match_) in same_word_index { | ||||
|                 let mut replacement = query_enhancer.replacement(match_.query_index); | ||||
|                 let replacement_len = replacement.len(); | ||||
|                 let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); | ||||
|  | ||||
|                 if let Some(query_index) = replacement.next() { | ||||
|                     let word_index = match_.word_index + padding as u16; | ||||
|                     let match_ = TmpMatch { | ||||
|                         query_index, | ||||
|                         word_index, | ||||
|                         ..*match_ | ||||
|                     }; | ||||
|                     padded_matches.push((*id, match_)); | ||||
|                 } | ||||
|  | ||||
|                 let mut found = false; | ||||
|  | ||||
|                 // look ahead and if there already is a match | ||||
|                 // corresponding to this padding word, abort the padding | ||||
|                 'padding: for (x, next_group) in nexts.enumerate() { | ||||
|                     for (i, query_index) in replacement.clone().enumerate().skip(x) { | ||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                         let padmatch = TmpMatch { | ||||
|                             query_index, | ||||
|                             word_index, | ||||
|                             ..*match_ | ||||
|                         }; | ||||
|  | ||||
|                         for (_, nmatch_) in next_group { | ||||
|                             let mut rep = query_enhancer.replacement(nmatch_.query_index); | ||||
|                             let query_index = rep.next().unwrap(); | ||||
|                             if query_index == padmatch.query_index { | ||||
|                                 if !found { | ||||
|                                     // if we find a corresponding padding for the | ||||
|                                     // first time we must push preceding paddings | ||||
|                                     for (i, query_index) in replacement.clone().enumerate().take(i) | ||||
|                                     { | ||||
|                                         let word_index = | ||||
|                                             match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                                         let match_ = TmpMatch { | ||||
|                                             query_index, | ||||
|                                             word_index, | ||||
|                                             ..*match_ | ||||
|                                         }; | ||||
|                                         padded_matches.push((*id, match_)); | ||||
|                                         biggest = biggest.max(i + 1); | ||||
|                                     } | ||||
|                                 } | ||||
|  | ||||
|                                 padded_matches.push((*id, padmatch)); | ||||
|                                 found = true; | ||||
|                                 continue 'padding; | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     // if we do not find a corresponding padding in the | ||||
|                     // next groups so stop here and pad what was found | ||||
|                     break; | ||||
|                 } | ||||
|  | ||||
|                 if !found { | ||||
|                     // if no padding was found in the following matches | ||||
|                     // we must insert the entire padding | ||||
|                     for (i, query_index) in replacement.enumerate() { | ||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||
|                         let match_ = TmpMatch { | ||||
|                             query_index, | ||||
|                             word_index, | ||||
|                             ..*match_ | ||||
|                         }; | ||||
|                         padded_matches.push((*id, match_)); | ||||
|                     } | ||||
|  | ||||
|                     biggest = biggest.max(replacement_len - 1); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             padding += biggest; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { | ||||
|         document_matches.sort_unstable(); | ||||
|     } | ||||
|  | ||||
|     debug!("padding matches took {:.02?}", before_padding.elapsed()); | ||||
|  | ||||
|     // With this check we can see that the loop above takes something | ||||
|     // like 43% of the search time even when no rewrite is needed. | ||||
|     // assert_eq!(before_matches, padded_matches); | ||||
|  | ||||
|     SetBuf::new_unchecked(padded_matches) | ||||
| } | ||||
|  | ||||
| fn fetch_raw_documents( | ||||
|     reader: &heed::RoTxn<MainT>, | ||||
|     automatons_groups: &[AutomatonGroup], | ||||
|     query_enhancer: &QueryEnhancer, | ||||
|     searchables: Option<&ReorderedAttrs>, | ||||
|     main_store: store::Main, | ||||
|     postings_lists_store: store::PostingsLists, | ||||
| ) -> MResult<Vec<RawDocument>> { | ||||
|     let mut matches = Vec::new(); | ||||
|     let mut highlights = Vec::new(); | ||||
|  | ||||
|     let words = match main_store.words_fst(reader)? { | ||||
|         Some(words) => words, | ||||
|         None => return Ok(Vec::new()), | ||||
|     }; | ||||
|  | ||||
|     let before_automatons_groups_loop = Instant::now(); | ||||
|     let mut doc_indexes_rewrite = Duration::default(); | ||||
|     let mut retrieve_postings_lists = Duration::default(); | ||||
|     let mut stream_reserve = Duration::default(); | ||||
|     let mut covered_area_time = Duration::default(); | ||||
|     let mut eval_time = Duration::default(); | ||||
|  | ||||
|     for group in automatons_groups { | ||||
|         let AutomatonGroup { is_phrase_query, automatons } = group; | ||||
|         let phrase_query_len = automatons.len(); | ||||
|  | ||||
|         let mut tmp_matches = Vec::new(); | ||||
|         for (id, automaton) in automatons.into_iter().enumerate() { | ||||
|             let Automaton { index, is_exact, query_len, query, .. } = automaton; | ||||
|             let dfa = automaton.dfa(); | ||||
|  | ||||
|             let before_stream_loop = Instant::now(); | ||||
|             let mut stream_count = 0; | ||||
|  | ||||
|             let mut stream = words.search(&dfa).into_stream(); | ||||
|             while let Some(input) = stream.next() { | ||||
|                 let before_eval_time = Instant::now(); | ||||
|                 let distance = dfa.eval(input).to_u8(); | ||||
|                 eval_time += before_eval_time.elapsed(); | ||||
|  | ||||
|                 let is_exact = *is_exact && distance == 0 && input.len() == *query_len; | ||||
|  | ||||
|                 stream_count += 1; | ||||
|  | ||||
|                 let before_covered_area = Instant::now(); | ||||
|                 let covered_area = if *query_len > input.len() { | ||||
|                     input.len() | ||||
|                 } else { | ||||
|                     prefix_damerau_levenshtein(query.as_bytes(), input).1 | ||||
|                 }; | ||||
|                 covered_area_time += before_covered_area.elapsed(); | ||||
|  | ||||
|                 let before_retrieve_postings_lists = Instant::now(); | ||||
|                 let doc_indexes = match postings_lists_store.postings_list(reader, input)? { | ||||
|                     Some(doc_indexes) => doc_indexes, | ||||
|                     None => continue, | ||||
|                 }; | ||||
|                 retrieve_postings_lists += before_retrieve_postings_lists.elapsed(); | ||||
|  | ||||
|                 let before_stream_reserve = Instant::now(); | ||||
|                 tmp_matches.reserve(doc_indexes.len()); | ||||
|                 stream_reserve += before_stream_reserve.elapsed(); | ||||
|  | ||||
|                 let before_doc_indexes_rewrite = Instant::now(); | ||||
|                 for di in doc_indexes.as_ref() { | ||||
|                     let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); | ||||
|                     if let Some(attribute) = attribute { | ||||
|                         let match_ = TmpMatch { | ||||
|                             query_index: *index as u32, | ||||
|                             distance, | ||||
|                             attribute, | ||||
|                             word_index: di.word_index, | ||||
|                             is_exact, | ||||
|                         }; | ||||
|  | ||||
|                         let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value()); | ||||
|                         let covered_area = cmp::min(covered_area, di.char_length); | ||||
|  | ||||
|                         let highlight = Highlight { | ||||
|                             attribute: di.attribute, | ||||
|                             char_index: di.char_index, | ||||
|                             char_length: covered_area, | ||||
|                         }; | ||||
|  | ||||
|                         tmp_matches.push((di.document_id, id, match_, highlight)); | ||||
|                     } | ||||
|                 } | ||||
|                 doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed(); | ||||
|             } | ||||
|             debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count); | ||||
|         } | ||||
|  | ||||
|         if *is_phrase_query { | ||||
|             tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); | ||||
|             for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { | ||||
|                 for window in group.windows(2) { | ||||
|                     let (ida, ia, ma, ha) = window[0]; | ||||
|                     let (idb, ib, mb, hb) = window[1]; | ||||
|  | ||||
|                     debug_assert_eq!(ida, idb); | ||||
|  | ||||
|                     // if matches must follow and actually follows themselves | ||||
|                     if ia + 1 == ib && ma.word_index + 1 == mb.word_index { | ||||
|                         // TODO we must make it work for phrase query longer than 2 | ||||
|                         // if the second match is the last phrase query word | ||||
|                         if ib + 1 == phrase_query_len { | ||||
|                             // insert first match | ||||
|                             matches.push((ida, ma)); | ||||
|                             highlights.push((ida, ha)); | ||||
|  | ||||
|                             // insert second match | ||||
|                             matches.push((idb, mb)); | ||||
|                             highlights.push((idb, hb)); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } else { | ||||
|             let before_rerewrite = Instant::now(); | ||||
|  | ||||
|             matches.reserve(tmp_matches.len()); | ||||
|             highlights.reserve(tmp_matches.len()); | ||||
|  | ||||
|             for (id, _, match_, highlight) in tmp_matches { | ||||
|                 matches.push((id, match_)); | ||||
|                 highlights.push((id, highlight)); | ||||
|             } | ||||
|             debug!("rerewrite took {:.02?}", before_rerewrite.elapsed()); | ||||
|         } | ||||
|     } | ||||
|     debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); | ||||
|     debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite); | ||||
|     debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists); | ||||
|     debug!("stream reserve took {:.02?}", stream_reserve); | ||||
|     debug!("covered area took {:.02?}", covered_area_time); | ||||
|     debug!("eval value took {:.02?}", eval_time); | ||||
|  | ||||
|     // { | ||||
|     //     let mut cloned = matches.clone(); | ||||
|     //     let before_sort_test = Instant::now(); | ||||
|     //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance)); | ||||
|     //     debug!("sorting test took {:.02?}", before_sort_test.elapsed()); | ||||
|     // } | ||||
|  | ||||
|     let before_multiword_rewrite_matches = Instant::now(); | ||||
|     debug!("number of matches before rewrite {}", matches.len()); | ||||
|     debug!("{:?}", query_enhancer); | ||||
|     let matches = multiword_rewrite_matches(matches, &query_enhancer); | ||||
|     debug!("number of matches after rewrite {}", matches.len()); | ||||
|     debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed()); | ||||
|  | ||||
|     let before_highlight_sorting = Instant::now(); | ||||
|     let highlights = { | ||||
|         highlights.sort_unstable_by_key(|(id, _)| *id); | ||||
|         SetBuf::new_unchecked(highlights) | ||||
|     }; | ||||
|     debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed()); | ||||
|  | ||||
|     let before_raw_documents = Instant::now(); | ||||
|     let raw_documents = raw_documents_from(matches, highlights); | ||||
|     debug!("raw_documents took {:.02?}", before_raw_documents.elapsed()); | ||||
|     debug!("documents to worry about: {}", raw_documents.len()); | ||||
|  | ||||
|     Ok(raw_documents) | ||||
| } | ||||
|  | ||||
| impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | ||||
|     pub fn new( | ||||
|         main: store::Main, | ||||
| @@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | ||||
|                 reader, | ||||
|                 query, | ||||
|                 range, | ||||
|                 // self.criteria, | ||||
|                 self.criteria, | ||||
|                 self.main_store, | ||||
|                 self.postings_lists_store, | ||||
|                 self.documents_fields_counts_store, | ||||
|   | ||||
| @@ -1,183 +1,89 @@ | ||||
| use std::fmt; | ||||
| use std::sync::Arc; | ||||
|  | ||||
| use compact_arena::SmallArena; | ||||
| use itertools::EitherOrBoth; | ||||
| use sdset::SetBuf; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; | ||||
| use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; | ||||
|  | ||||
| #[derive(Clone)] | ||||
| pub struct RawDocument { | ||||
|     pub id: DocumentId, | ||||
|     pub matches: SharedMatches, | ||||
|     pub highlights: Vec<Highlight>, | ||||
|     pub fields_counts: Option<SetBuf<AttrCount>>, | ||||
| pub struct RawDocument<'a, 'tag> { | ||||
|     pub id: crate::DocumentId, | ||||
|     pub raw_matches: &'a mut [BareMatch<'tag>], | ||||
|     pub processed_matches: Vec<SimpleMatch>, | ||||
|     /// The list of minimum `distance` found | ||||
|     pub processed_distances: Vec<Option<u8>>, | ||||
| } | ||||
|  | ||||
| impl RawDocument { | ||||
|     pub fn query_index(&self) -> &[u32] { | ||||
|         let r = self.matches.range; | ||||
|         // it is safe because construction/modifications | ||||
|         // can only be done in this module | ||||
|         unsafe { | ||||
|             &self | ||||
|                 .matches | ||||
|                 .matches | ||||
|                 .query_index | ||||
|                 .get_unchecked(r.start..r.end) | ||||
| impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||
|     pub fn new<'txn>( | ||||
|         raw_matches: &'a mut [BareMatch<'tag>], | ||||
|         automatons: &[QueryWordAutomaton], | ||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||
|     ) -> Option<RawDocument<'a, 'tag>> | ||||
|     { | ||||
|         raw_matches.sort_unstable_by_key(|m| m.query_index); | ||||
|  | ||||
|         let mut previous_word = None; | ||||
|         for i in 0..raw_matches.len() { | ||||
|             let a = &raw_matches[i]; | ||||
|             let auta = &automatons[a.query_index as usize]; | ||||
|  | ||||
|             match auta.phrase_query { | ||||
|                 Some((0, _)) => { | ||||
|                     let b = match raw_matches.get(i + 1) { | ||||
|                         Some(b) => b, | ||||
|                         None => { | ||||
|                             postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                             continue; | ||||
|                         } | ||||
|                     }; | ||||
|  | ||||
|                     if a.query_index + 1 != b.query_index { | ||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                         continue | ||||
|                     } | ||||
|  | ||||
|                     let pla = &postings_lists[a.postings_list]; | ||||
|                     let plb = &postings_lists[b.postings_list]; | ||||
|  | ||||
|                     let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||
|                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) | ||||
|                     }); | ||||
|  | ||||
|                     let mut newa = Vec::new(); | ||||
|                     let mut newb = Vec::new(); | ||||
|  | ||||
|                     for eb in iter { | ||||
|                         if let EitherOrBoth::Both(a, b) = eb { | ||||
|                             newa.push(*a); | ||||
|                             newb.push(*b); | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     if !newa.is_empty() { | ||||
|                         previous_word = Some(a.query_index); | ||||
|                     } | ||||
|  | ||||
|                     postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); | ||||
|                     postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); | ||||
|                 }, | ||||
|                 Some((1, _)) => { | ||||
|                     if previous_word.take() != Some(a.query_index - 1) { | ||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||
|                     } | ||||
|                 }, | ||||
|                 Some((_, _)) => unreachable!(), | ||||
|                 None => (), | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn distance(&self) -> &[u8] { | ||||
|         let r = self.matches.range; | ||||
|         // it is safe because construction/modifications | ||||
|         // can only be done in this module | ||||
|         unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } | ||||
|     } | ||||
|  | ||||
|     pub fn attribute(&self) -> &[u16] { | ||||
|         let r = self.matches.range; | ||||
|         // it is safe because construction/modifications | ||||
|         // can only be done in this module | ||||
|         unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } | ||||
|     } | ||||
|  | ||||
|     pub fn word_index(&self) -> &[u16] { | ||||
|         let r = self.matches.range; | ||||
|         // it is safe because construction/modifications | ||||
|         // can only be done in this module | ||||
|         unsafe { | ||||
|             &self | ||||
|                 .matches | ||||
|                 .matches | ||||
|                 .word_index | ||||
|                 .get_unchecked(r.start..r.end) | ||||
|         if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { | ||||
|             return None | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn is_exact(&self) -> &[bool] { | ||||
|         let r = self.matches.range; | ||||
|         // it is safe because construction/modifications | ||||
|         // can only be done in this module | ||||
|         unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl fmt::Debug for RawDocument { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         f.write_str("RawDocument {\r\n")?; | ||||
|         f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; | ||||
|         f.write_fmt(format_args!( | ||||
|             "{:>15}: {:^5?},\r\n", | ||||
|             "query_index", | ||||
|             self.query_index() | ||||
|         ))?; | ||||
|         f.write_fmt(format_args!( | ||||
|             "{:>15}: {:^5?},\r\n", | ||||
|             "distance", | ||||
|             self.distance() | ||||
|         ))?; | ||||
|         f.write_fmt(format_args!( | ||||
|             "{:>15}: {:^5?},\r\n", | ||||
|             "attribute", | ||||
|             self.attribute() | ||||
|         ))?; | ||||
|         f.write_fmt(format_args!( | ||||
|             "{:>15}: {:^5?},\r\n", | ||||
|             "word_index", | ||||
|             self.word_index() | ||||
|         ))?; | ||||
|         f.write_fmt(format_args!( | ||||
|             "{:>15}: {:^5?},\r\n", | ||||
|             "is_exact", | ||||
|             self.is_exact() | ||||
|         ))?; | ||||
|         f.write_str("}")?; | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn raw_documents_from( | ||||
|     matches: SetBuf<(DocumentId, TmpMatch)>, | ||||
|     highlights: SetBuf<(DocumentId, Highlight)> | ||||
| ) -> Vec<RawDocument> { | ||||
|     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); | ||||
|     let mut matches2 = Matches::with_capacity(matches.len()); | ||||
|  | ||||
|     let matches = matches.linear_group_by_key(|(id, _)| *id); | ||||
|     let highlights = highlights.linear_group_by_key(|(id, _)| *id); | ||||
|  | ||||
|     for (mgroup, hgroup) in matches.zip(highlights) { | ||||
|         assert_eq!(mgroup[0].0, hgroup[0].0); | ||||
|  | ||||
|         let document_id = mgroup[0].0; | ||||
|         let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); | ||||
|         let end = start + mgroup.len(); | ||||
|         let highlights = hgroup.iter().map(|(_, h)| *h).collect(); | ||||
|         let fields_counts = None; | ||||
|  | ||||
|         docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); | ||||
|         // TODO we could try to keep both data | ||||
|         //  - the data oriented one and, | ||||
|         //  - the raw one, the one that comes from the arguments of this function | ||||
|         // This way we would be able to only produce data oriented lazily. | ||||
|         // | ||||
|         // For example the default first criterion is `SumOfTypos` | ||||
|         // and just needs the `query_index` and the `distance` fields. | ||||
|         // It would probably be good to avoid wasting time sorting other fields of documents | ||||
|         // that will never ever reach the second criterion. | ||||
|         matches2.extend_from_slice(mgroup); | ||||
|     } | ||||
|  | ||||
|     let matches = Arc::new(matches2); | ||||
|     docs_ranges | ||||
|         .into_iter() | ||||
|         .map(|(id, range, highlights, fields_counts)| { | ||||
|             let matches = SharedMatches { range, matches: matches.clone() }; | ||||
|             RawDocument { id, matches, highlights, fields_counts } | ||||
|         Some(RawDocument { | ||||
|             id: raw_matches[0].document_id, | ||||
|             raw_matches, | ||||
|             processed_matches: Vec::new(), | ||||
|             processed_distances: Vec::new(), | ||||
|         }) | ||||
|         .collect() | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Copy, Clone)] | ||||
| struct Range { | ||||
|     start: usize, | ||||
|     end: usize, | ||||
| } | ||||
|  | ||||
| #[derive(Clone)] | ||||
| pub struct SharedMatches { | ||||
|     range: Range, | ||||
|     matches: Arc<Matches>, | ||||
| } | ||||
|  | ||||
| #[derive(Clone)] | ||||
| struct Matches { | ||||
|     query_index: Vec<u32>, | ||||
|     distance: Vec<u8>, | ||||
|     attribute: Vec<u16>, | ||||
|     word_index: Vec<u16>, | ||||
|     is_exact: Vec<bool>, | ||||
| } | ||||
|  | ||||
| impl Matches { | ||||
|     fn with_capacity(cap: usize) -> Matches { | ||||
|         Matches { | ||||
|             query_index: Vec::with_capacity(cap), | ||||
|             distance: Vec::with_capacity(cap), | ||||
|             attribute: Vec::with_capacity(cap), | ||||
|             word_index: Vec::with_capacity(cap), | ||||
|             is_exact: Vec::with_capacity(cap), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { | ||||
|         for (_, match_) in matches { | ||||
|             self.query_index.push(match_.query_index); | ||||
|             self.distance.push(match_.distance); | ||||
|             self.attribute.push(match_.attribute); | ||||
|             self.word_index.push(match_.word_index); | ||||
|             self.is_exact.push(match_.is_exact); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -310,11 +310,11 @@ impl<'a> SearchBuilder<'a> { | ||||
|             if let Some(ranking_rules_order) = ranking_order { | ||||
|                 for rule in ranking_rules_order { | ||||
|                     match rule.as_str() { | ||||
|                         "_sum_of_typos" => builder.push(SumOfTypos), | ||||
|                         "_number_of_words" => builder.push(NumberOfWords), | ||||
|                         "_word_proximity" => builder.push(WordsProximity), | ||||
|                         "_sum_of_words_attribute" => builder.push(SumOfWordsAttribute), | ||||
|                         "_sum_of_words_position" => builder.push(SumOfWordsPosition), | ||||
|                         "_typo" => builder.push(Typo), | ||||
|                         "_words" => builder.push(Words), | ||||
|                         "_proximity" => builder.push(Proximity), | ||||
|                         "_attribute" => builder.push(Attribute), | ||||
|                         "_words_position" => builder.push(WordsPosition), | ||||
|                         "_exact" => builder.push(Exact), | ||||
|                         _ => { | ||||
|                             let order = match ranking_rules.get(rule.as_str()) { | ||||
| @@ -340,11 +340,11 @@ impl<'a> SearchBuilder<'a> { | ||||
|                 builder.push(DocumentId); | ||||
|                 return Ok(Some(builder.build())); | ||||
|             } else { | ||||
|                 builder.push(SumOfTypos); | ||||
|                 builder.push(NumberOfWords); | ||||
|                 builder.push(WordsProximity); | ||||
|                 builder.push(SumOfWordsAttribute); | ||||
|                 builder.push(SumOfWordsPosition); | ||||
|                 builder.push(Typo); | ||||
|                 builder.push(Words); | ||||
|                 builder.push(Proximity); | ||||
|                 builder.push(Attribute); | ||||
|                 builder.push(WordsPosition); | ||||
|                 builder.push(Exact); | ||||
|                 for (rule, order) in ranking_rules.iter() { | ||||
|                     let custom_ranking = match order { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user