mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	Update the criteria to the new ones
This commit is contained in:
		| @@ -1,9 +1,6 @@ | |||||||
| use std::ops::Deref; | use std::ops::Deref; | ||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::cmp::Ordering; |  | ||||||
| use std::collections::HashSet; |  | ||||||
| use std::io::Write; |  | ||||||
| use std::mem; | use std::mem; | ||||||
| use std::ops::Range; | use std::ops::Range; | ||||||
| use std::rc::Rc; | use std::rc::Rc; | ||||||
| @@ -17,15 +14,15 @@ use meilisearch_tokenizer::{is_cjk, split_query_string}; | |||||||
| use meilisearch_types::{DocIndex, Highlight}; | use meilisearch_types::{DocIndex, Highlight}; | ||||||
| use sdset::{Set, SetBuf}; | use sdset::{Set, SetBuf}; | ||||||
| use slice_group_by::{GroupBy, GroupByMut}; | use slice_group_by::{GroupBy, GroupByMut}; | ||||||
| use itertools::EitherOrBoth; |  | ||||||
|  |  | ||||||
| use crate::automaton::NGRAMS; | use crate::automaton::NGRAMS; | ||||||
| use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; | use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; | ||||||
| use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; | use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; | ||||||
| use crate::automaton::{normalize_str, split_best_frequency}; | use crate::automaton::{normalize_str, split_best_frequency}; | ||||||
|  |  | ||||||
| use crate::criterion2::*; | use crate::criterion::Criteria; | ||||||
| use crate::levenshtein::prefix_damerau_levenshtein; | use crate::levenshtein::prefix_damerau_levenshtein; | ||||||
|  | use crate::raw_document::RawDocument; | ||||||
| use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; | use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; | ||||||
| use crate::{store, Document, DocumentId, MResult}; | use crate::{store, Document, DocumentId, MResult}; | ||||||
|  |  | ||||||
| @@ -33,6 +30,7 @@ pub fn bucket_sort<'c>( | |||||||
|     reader: &heed::RoTxn<MainT>, |     reader: &heed::RoTxn<MainT>, | ||||||
|     query: &str, |     query: &str, | ||||||
|     range: Range<usize>, |     range: Range<usize>, | ||||||
|  |     criteria: Criteria<'c>, | ||||||
|     main_store: store::Main, |     main_store: store::Main, | ||||||
|     postings_lists_store: store::PostingsLists, |     postings_lists_store: store::PostingsLists, | ||||||
|     documents_fields_counts_store: store::DocumentsFieldsCounts, |     documents_fields_counts_store: store::DocumentsFieldsCounts, | ||||||
| @@ -76,17 +74,7 @@ pub fn bucket_sort<'c>( | |||||||
|  |  | ||||||
|     let mut groups = vec![raw_documents.as_mut_slice()]; |     let mut groups = vec![raw_documents.as_mut_slice()]; | ||||||
|  |  | ||||||
|     let criteria = [ |     'criteria: for criterion in criteria.as_ref() { | ||||||
|         Box::new(Typo) as Box<dyn Criterion>, |  | ||||||
|         Box::new(Words), |  | ||||||
|         Box::new(Proximity), |  | ||||||
|         Box::new(Attribute), |  | ||||||
|         Box::new(WordsPosition), |  | ||||||
|         Box::new(Exact), |  | ||||||
|         Box::new(StableDocId), |  | ||||||
|     ]; |  | ||||||
|  |  | ||||||
|     'criteria: for criterion in &criteria { |  | ||||||
|         let tmp_groups = mem::replace(&mut groups, Vec::new()); |         let tmp_groups = mem::replace(&mut groups, Vec::new()); | ||||||
|         let mut documents_seen = 0; |         let mut documents_seen = 0; | ||||||
|  |  | ||||||
| @@ -131,7 +119,7 @@ pub fn bucket_sort<'c>( | |||||||
|         }).collect(); |         }).collect(); | ||||||
|  |  | ||||||
|         Document { |         Document { | ||||||
|             id: d.raw_matches[0].document_id, |             id: d.id, | ||||||
|             highlights, |             highlights, | ||||||
|             #[cfg(test)] matches: Vec::new(), |             #[cfg(test)] matches: Vec::new(), | ||||||
|         } |         } | ||||||
| @@ -140,88 +128,6 @@ pub fn bucket_sort<'c>( | |||||||
|     Ok(iter.collect()) |     Ok(iter.collect()) | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct RawDocument<'a, 'tag> { |  | ||||||
|     pub raw_matches: &'a mut [BareMatch<'tag>], |  | ||||||
|     pub processed_matches: Vec<SimpleMatch>, |  | ||||||
|     /// The list of minimum `distance` found |  | ||||||
|     pub processed_distances: Vec<Option<u8>>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'a, 'tag> RawDocument<'a, 'tag> { |  | ||||||
|     fn new<'txn>( |  | ||||||
|         raw_matches: &'a mut [BareMatch<'tag>], |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, |  | ||||||
|     ) -> Option<RawDocument<'a, 'tag>> |  | ||||||
|     { |  | ||||||
|         raw_matches.sort_unstable_by_key(|m| m.query_index); |  | ||||||
|  |  | ||||||
|         let mut previous_word = None; |  | ||||||
|         for i in 0..raw_matches.len() { |  | ||||||
|             let a = &raw_matches[i]; |  | ||||||
|             let auta = &automatons[a.query_index as usize]; |  | ||||||
|  |  | ||||||
|             match auta.phrase_query { |  | ||||||
|                 Some((0, _)) => { |  | ||||||
|                     let b = match raw_matches.get(i + 1) { |  | ||||||
|                         Some(b) => b, |  | ||||||
|                         None => { |  | ||||||
|                             postings_lists[a.postings_list].rewrite_with(SetBuf::default()); |  | ||||||
|                             continue; |  | ||||||
|                         } |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     if a.query_index + 1 != b.query_index { |  | ||||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); |  | ||||||
|                         continue |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     let pla = &postings_lists[a.postings_list]; |  | ||||||
|                     let plb = &postings_lists[b.postings_list]; |  | ||||||
|  |  | ||||||
|                     let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { |  | ||||||
|                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) |  | ||||||
|                     }); |  | ||||||
|  |  | ||||||
|                     let mut newa = Vec::new(); |  | ||||||
|                     let mut newb = Vec::new(); |  | ||||||
|  |  | ||||||
|                     for eb in iter { |  | ||||||
|                         if let EitherOrBoth::Both(a, b) = eb { |  | ||||||
|                             newa.push(*a); |  | ||||||
|                             newb.push(*b); |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     if !newa.is_empty() { |  | ||||||
|                         previous_word = Some(a.query_index); |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); |  | ||||||
|                     postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); |  | ||||||
|                 }, |  | ||||||
|                 Some((1, _)) => { |  | ||||||
|                     if previous_word.take() != Some(a.query_index - 1) { |  | ||||||
|                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); |  | ||||||
|                     } |  | ||||||
|                 }, |  | ||||||
|                 Some((_, _)) => unreachable!(), |  | ||||||
|                 None => (), |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { |  | ||||||
|             return None |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         Some(RawDocument { |  | ||||||
|             raw_matches, |  | ||||||
|             processed_matches: Vec::new(), |  | ||||||
|             processed_distances: Vec::new(), |  | ||||||
|         }) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct BareMatch<'tag> { | pub struct BareMatch<'tag> { | ||||||
|     pub document_id: DocumentId, |     pub document_id: DocumentId, | ||||||
|     pub query_index: u16, |     pub query_index: u16, | ||||||
|   | |||||||
							
								
								
									
										48
									
								
								meilisearch-core/src/criterion/attribute.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								meilisearch-core/src/criterion/attribute.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | |||||||
|  | use std::cmp::{self, Ordering}; | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  |  | ||||||
|  | use super::{Criterion, prepare_raw_matches}; | ||||||
|  |  | ||||||
|  | pub struct Attribute; | ||||||
|  |  | ||||||
|  | impl Criterion for Attribute { | ||||||
|  |     fn name(&self) -> &str { "attribute" } | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn evaluate<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument<'a, 'tag>, | ||||||
|  |         rhs: &RawDocument<'a, 'tag>, | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         #[inline] | ||||||
|  |         fn best_attribute(matches: &[SimpleMatch]) -> u16 { | ||||||
|  |             let mut best_attribute = u16::max_value(); | ||||||
|  |             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||||
|  |                 best_attribute = cmp::min(best_attribute, group[0].attribute); | ||||||
|  |             } | ||||||
|  |             best_attribute | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let lhs = best_attribute(&lhs.processed_matches); | ||||||
|  |         let rhs = best_attribute(&rhs.processed_matches); | ||||||
|  |  | ||||||
|  |         lhs.cmp(&rhs) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,16 +1,37 @@ | |||||||
| use crate::criterion::Criterion; |  | ||||||
| use crate::RawDocument; |  | ||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] | use compact_arena::SmallArena; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  | use super::Criterion; | ||||||
|  |  | ||||||
| pub struct DocumentId; | pub struct DocumentId; | ||||||
|  |  | ||||||
| impl Criterion for DocumentId { | impl Criterion for DocumentId { | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |     fn name(&self) -> &str { "stable document id" } | ||||||
|         lhs.id.cmp(&rhs.id) |  | ||||||
|  |     fn prepare( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument], | ||||||
|  |         postings_lists: &mut SmallArena<PostingsListView>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         // ... | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |     fn evaluate( | ||||||
|         "DocumentId" |         &self, | ||||||
|  |         lhs: &RawDocument, | ||||||
|  |         rhs: &RawDocument, | ||||||
|  |         postings_lists: &SmallArena<PostingsListView>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         let lhs = &lhs.id; | ||||||
|  |         let rhs = &rhs.id; | ||||||
|  |  | ||||||
|  |         lhs.cmp(rhs) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,131 +1,51 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::{Ordering, Reverse}; | ||||||
|  |  | ||||||
| use sdset::Set; | use compact_arena::SmallArena; | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
| use crate::criterion::Criterion; | use crate::automaton::QueryEnhancer; | ||||||
| use crate::{AttrCount, RawDocument}; | use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  | use super::Criterion; | ||||||
|  |  | ||||||
| #[inline] |  | ||||||
| fn number_exact_matches( |  | ||||||
|     query_index: &[u32], |  | ||||||
|     attribute: &[u16], |  | ||||||
|     is_exact: &[bool], |  | ||||||
|     fields_counts: &Set<AttrCount>, |  | ||||||
| ) -> usize { |  | ||||||
|     let mut count = 0; |  | ||||||
|     let mut index = 0; |  | ||||||
|  |  | ||||||
|     for group in query_index.linear_group() { |  | ||||||
|         let len = group.len(); |  | ||||||
|  |  | ||||||
|         let mut found_exact = false; |  | ||||||
|         for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() { |  | ||||||
|             if *is_exact { |  | ||||||
|                 found_exact = true; |  | ||||||
|                 let attr = &attribute[index + pos]; |  | ||||||
|                 if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) { |  | ||||||
|                     let AttrCount { count, .. } = fields_counts[pos]; |  | ||||||
|                     if count == 1 { |  | ||||||
|                         return usize::max_value(); |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         count += found_exact as usize; |  | ||||||
|         index += len; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     count |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| pub struct Exact; | pub struct Exact; | ||||||
|  |  | ||||||
| impl Criterion for Exact { | impl Criterion for Exact { | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |     fn name(&self) -> &str { "exact" } | ||||||
|         let lhs = { |  | ||||||
|             let query_index = lhs.query_index(); |  | ||||||
|             let is_exact = lhs.is_exact(); |  | ||||||
|             let attribute = lhs.attribute(); |  | ||||||
|             let fields_counts = lhs.fields_counts.as_ref().unwrap(); |  | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |     fn prepare( | ||||||
|         }; |         &self, | ||||||
|  |         documents: &mut [RawDocument], | ||||||
|  |         postings_lists: &mut SmallArena<PostingsListView>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         for document in documents { | ||||||
|  |             document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|         let rhs = { |     fn evaluate( | ||||||
|             let query_index = rhs.query_index(); |         &self, | ||||||
|             let is_exact = rhs.is_exact(); |         lhs: &RawDocument, | ||||||
|             let attribute = rhs.attribute(); |         rhs: &RawDocument, | ||||||
|             let fields_counts = rhs.fields_counts.as_ref().unwrap(); |         postings_lists: &SmallArena<PostingsListView>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         #[inline] | ||||||
|  |         fn sum_exact_query_words(matches: &[BareMatch]) -> usize { | ||||||
|  |             let mut sum_exact_query_words = 0; | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||||
|         }; |                 sum_exact_query_words += group[0].is_exact as usize; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             sum_exact_query_words | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let lhs = sum_exact_query_words(&lhs.raw_matches); | ||||||
|  |         let rhs = sum_exact_query_words(&rhs.raw_matches); | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs).reverse() |         lhs.cmp(&rhs).reverse() | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "Exact" |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     // typing: "soulier" |  | ||||||
|     // |  | ||||||
|     // doc0: "Soulier bleu" |  | ||||||
|     // doc1: "souliereres rouge" |  | ||||||
|     #[test] |  | ||||||
|     fn easy_case() { |  | ||||||
|         let doc0 = { |  | ||||||
|             let query_index = &[0]; |  | ||||||
|             let attribute = &[0]; |  | ||||||
|             let is_exact = &[true]; |  | ||||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); |  | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let doc1 = { |  | ||||||
|             let query_index = &[0]; |  | ||||||
|             let attribute = &[0]; |  | ||||||
|             let is_exact = &[false]; |  | ||||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); |  | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // typing: "soulier" |  | ||||||
|     // |  | ||||||
|     // doc0: { 0. "soulier" } |  | ||||||
|     // doc1: { 0. "soulier bleu et blanc" } |  | ||||||
|     #[test] |  | ||||||
|     fn basic() { |  | ||||||
|         let doc0 = { |  | ||||||
|             let query_index = &[0]; |  | ||||||
|             let attribute = &[0]; |  | ||||||
|             let is_exact = &[true]; |  | ||||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); |  | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let doc1 = { |  | ||||||
|             let query_index = &[0]; |  | ||||||
|             let attribute = &[0]; |  | ||||||
|             let is_exact = &[true]; |  | ||||||
|             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); |  | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,58 +1,58 @@ | |||||||
| mod document_id; | use std::cmp::{self, Ordering}; | ||||||
| mod exact; |  | ||||||
| mod number_of_words; |  | ||||||
| mod sort_by_attr; |  | ||||||
| mod sum_of_typos; |  | ||||||
| mod sum_of_words_attribute; |  | ||||||
| mod sum_of_words_position; |  | ||||||
| mod words_proximity; |  | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  | use sdset::SetBuf; | ||||||
|  | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; | ||||||
| use crate::RawDocument; | use crate::RawDocument; | ||||||
| use std::cmp::Ordering; |  | ||||||
|  |  | ||||||
| pub use self::{ | mod typo; | ||||||
|     document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, | mod words; | ||||||
|     sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, | mod proximity; | ||||||
|     sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, | mod attribute; | ||||||
|     words_proximity::WordsProximity, | mod words_position; | ||||||
| }; | mod exact; | ||||||
|  | mod document_id; | ||||||
|  | mod sort_by_attr; | ||||||
|  |  | ||||||
| pub trait Criterion: Send + Sync { | pub use self::typo::Typo; | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; | pub use self::words::Words; | ||||||
|  | pub use self::proximity::Proximity; | ||||||
|  | pub use self::attribute::Attribute; | ||||||
|  | pub use self::words_position::WordsPosition; | ||||||
|  | pub use self::exact::Exact; | ||||||
|  | pub use self::document_id::DocumentId; | ||||||
|  | pub use self::sort_by_attr::SortByAttr; | ||||||
|  |  | ||||||
|  | pub trait Criterion { | ||||||
|     fn name(&self) -> &str; |     fn name(&self) -> &str; | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ); | ||||||
|  |  | ||||||
|  |     fn evaluate<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument<'a, 'tag>, | ||||||
|  |         rhs: &RawDocument<'a, 'tag>, | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     ) -> Ordering; | ||||||
|  |  | ||||||
|     #[inline] |     #[inline] | ||||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { |     fn eq<'a, 'tag, 'txn>( | ||||||
|         self.evaluate(lhs, rhs) == Ordering::Equal |         &self, | ||||||
|     } |         lhs: &RawDocument<'a, 'tag>, | ||||||
| } |         rhs: &RawDocument<'a, 'tag>, | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
| impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { |     ) -> bool | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |     { | ||||||
|         (**self).evaluate(lhs, rhs) |         self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         (**self).name() |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { |  | ||||||
|         (**self).eq(lhs, rhs) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<T: Criterion + ?Sized> Criterion for Box<T> { |  | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |  | ||||||
|         (**self).evaluate(lhs, rhs) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         (**self).name() |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { |  | ||||||
|         (**self).eq(lhs, rhs) |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -103,11 +103,11 @@ pub struct Criteria<'a> { | |||||||
| impl<'a> Default for Criteria<'a> { | impl<'a> Default for Criteria<'a> { | ||||||
|     fn default() -> Self { |     fn default() -> Self { | ||||||
|         CriteriaBuilder::with_capacity(7) |         CriteriaBuilder::with_capacity(7) | ||||||
|             .add(SumOfTypos) |             .add(Typo) | ||||||
|             .add(NumberOfWords) |             .add(Words) | ||||||
|             .add(WordsProximity) |             .add(Proximity) | ||||||
|             .add(SumOfWordsAttribute) |             .add(Attribute) | ||||||
|             .add(SumOfWordsPosition) |             .add(WordsPosition) | ||||||
|             .add(Exact) |             .add(Exact) | ||||||
|             .add(DocumentId) |             .add(DocumentId) | ||||||
|             .build() |             .build() | ||||||
| @@ -119,3 +119,165 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> { | |||||||
|         &self.inner |         &self.inner | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn prepare_query_distances<'a, 'tag, 'txn>( | ||||||
|  |     documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |     query_enhancer: &QueryEnhancer, | ||||||
|  |     automatons: &[QueryWordAutomaton], | ||||||
|  |     postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  | ) { | ||||||
|  |     for document in documents { | ||||||
|  |         if !document.processed_distances.is_empty() { continue } | ||||||
|  |  | ||||||
|  |         let mut processed = Vec::new(); | ||||||
|  |         for m in document.raw_matches.iter() { | ||||||
|  |             if postings_lists[m.postings_list].is_empty() { continue } | ||||||
|  |  | ||||||
|  |             let range = query_enhancer.replacement(m.query_index as u32); | ||||||
|  |             let new_len = cmp::max(range.end as usize, processed.len()); | ||||||
|  |             processed.resize(new_len, None); | ||||||
|  |  | ||||||
|  |             for index in range { | ||||||
|  |                 let index = index as usize; | ||||||
|  |                 processed[index] = match processed[index] { | ||||||
|  |                     Some(distance) if distance > m.distance => Some(m.distance), | ||||||
|  |                     Some(distance) => Some(distance), | ||||||
|  |                     None => Some(m.distance), | ||||||
|  |                 }; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         document.processed_distances = processed; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn prepare_raw_matches<'a, 'tag, 'txn>( | ||||||
|  |     documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |     postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     query_enhancer: &QueryEnhancer, | ||||||
|  |     automatons: &[QueryWordAutomaton], | ||||||
|  | ) { | ||||||
|  |     for document in documents { | ||||||
|  |         if !document.processed_matches.is_empty() { continue } | ||||||
|  |  | ||||||
|  |         let mut processed = Vec::new(); | ||||||
|  |         for m in document.raw_matches.iter() { | ||||||
|  |             let postings_list = &postings_lists[m.postings_list]; | ||||||
|  |             processed.reserve(postings_list.len()); | ||||||
|  |             for di in postings_list.as_ref() { | ||||||
|  |                 let simple_match = SimpleMatch { | ||||||
|  |                     query_index: m.query_index, | ||||||
|  |                     distance: m.distance, | ||||||
|  |                     attribute: di.attribute, | ||||||
|  |                     word_index: di.word_index, | ||||||
|  |                     is_exact: m.is_exact, | ||||||
|  |                 }; | ||||||
|  |                 processed.push(simple_match); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); | ||||||
|  |         document.processed_matches = processed.into_vec(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn multiword_rewrite_matches( | ||||||
|  |     matches: &mut [SimpleMatch], | ||||||
|  |     query_enhancer: &QueryEnhancer, | ||||||
|  |     automatons: &[QueryWordAutomaton], | ||||||
|  | ) -> SetBuf<SimpleMatch> | ||||||
|  | { | ||||||
|  |     matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); | ||||||
|  |  | ||||||
|  |     let mut padded_matches = Vec::with_capacity(matches.len()); | ||||||
|  |  | ||||||
|  |     // let before_padding = Instant::now(); | ||||||
|  |     // for each attribute of each document | ||||||
|  |     for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { | ||||||
|  |         // padding will only be applied | ||||||
|  |         // to word indices in the same attribute | ||||||
|  |         let mut padding = 0; | ||||||
|  |         let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); | ||||||
|  |  | ||||||
|  |         // for each match at the same position | ||||||
|  |         // in this document attribute | ||||||
|  |         while let Some(same_word_index) = iter.next() { | ||||||
|  |             // find the biggest padding | ||||||
|  |             let mut biggest = 0; | ||||||
|  |             for match_ in same_word_index { | ||||||
|  |                 let mut replacement = query_enhancer.replacement(match_.query_index as u32); | ||||||
|  |                 let replacement_len = replacement.len(); | ||||||
|  |                 let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); | ||||||
|  |  | ||||||
|  |                 if let Some(query_index) = replacement.next() { | ||||||
|  |                     let word_index = match_.word_index + padding as u16; | ||||||
|  |                     let query_index = query_index as u16; | ||||||
|  |                     let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||||
|  |                     padded_matches.push(match_); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 let mut found = false; | ||||||
|  |  | ||||||
|  |                 // look ahead and if there already is a match | ||||||
|  |                 // corresponding to this padding word, abort the padding | ||||||
|  |                 'padding: for (x, next_group) in nexts.enumerate() { | ||||||
|  |                     for (i, query_index) in replacement.clone().enumerate().skip(x) { | ||||||
|  |                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||||
|  |                         let query_index = query_index as u16; | ||||||
|  |                         let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; | ||||||
|  |  | ||||||
|  |                         for nmatch_ in next_group { | ||||||
|  |                             let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); | ||||||
|  |                             let query_index = rep.next().unwrap() as u16; | ||||||
|  |                             if query_index == padmatch.query_index { | ||||||
|  |                                 if !found { | ||||||
|  |                                     // if we find a corresponding padding for the | ||||||
|  |                                     // first time we must push preceding paddings | ||||||
|  |                                     for (i, query_index) in replacement.clone().enumerate().take(i) | ||||||
|  |                                     { | ||||||
|  |                                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||||
|  |                                         let query_index = query_index as u16; | ||||||
|  |                                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||||
|  |                                         padded_matches.push(match_); | ||||||
|  |                                         biggest = biggest.max(i + 1); | ||||||
|  |                                     } | ||||||
|  |                                 } | ||||||
|  |  | ||||||
|  |                                 padded_matches.push(padmatch); | ||||||
|  |                                 found = true; | ||||||
|  |                                 continue 'padding; | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     // if we do not find a corresponding padding in the | ||||||
|  |                     // next groups, stop here and pad what was found | ||||||
|  |                     break; | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 if !found { | ||||||
|  |                     // if no padding was found in the following matches | ||||||
|  |                     // we must insert the entire padding | ||||||
|  |                     for (i, query_index) in replacement.enumerate() { | ||||||
|  |                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; | ||||||
|  |                         let query_index = query_index as u16; | ||||||
|  |                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; | ||||||
|  |                         padded_matches.push(match_); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     biggest = biggest.max(replacement_len - 1); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             padding += biggest; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // debug!("padding matches took {:.02?}", before_padding.elapsed()); | ||||||
|  |  | ||||||
|  |     // With this check we can see that the loop above takes something | ||||||
|  |     // like 43% of the search time even when no rewrite is needed. | ||||||
|  |     // assert_eq!(before_matches, padded_matches); | ||||||
|  |  | ||||||
|  |     SetBuf::from_dirty(padded_matches) | ||||||
|  | } | ||||||
|   | |||||||
| @@ -1,31 +0,0 @@ | |||||||
| use crate::criterion::Criterion; |  | ||||||
| use crate::RawDocument; |  | ||||||
| use slice_group_by::GroupBy; |  | ||||||
| use std::cmp::Ordering; |  | ||||||
|  |  | ||||||
| #[inline] |  | ||||||
| fn number_of_query_words(query_index: &[u32]) -> usize { |  | ||||||
|     query_index.linear_group().count() |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| pub struct NumberOfWords; |  | ||||||
|  |  | ||||||
| impl Criterion for NumberOfWords { |  | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |  | ||||||
|         let lhs = { |  | ||||||
|             let query_index = lhs.query_index(); |  | ||||||
|             number_of_query_words(query_index) |  | ||||||
|         }; |  | ||||||
|         let rhs = { |  | ||||||
|             let query_index = rhs.query_index(); |  | ||||||
|             number_of_query_words(query_index) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs).reverse() |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "NumberOfWords" |  | ||||||
|     } |  | ||||||
| } |  | ||||||
							
								
								
									
										79
									
								
								meilisearch-core/src/criterion/proximity.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								meilisearch-core/src/criterion/proximity.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,79 @@ | |||||||
|  | use std::cmp::{self, Ordering}; | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  |  | ||||||
|  | use super::{Criterion, prepare_raw_matches}; | ||||||
|  |  | ||||||
|  | pub struct Proximity; | ||||||
|  |  | ||||||
|  | impl Criterion for Proximity { | ||||||
|  |     fn name(&self) -> &str { "proximity" } | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn evaluate<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument<'a, 'tag>, | ||||||
|  |         rhs: &RawDocument<'a, 'tag>, | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         const MAX_DISTANCE: u16 = 8; | ||||||
|  |  | ||||||
|  |         fn index_proximity(lhs: u16, rhs: u16) -> u16 { | ||||||
|  |             if lhs < rhs { | ||||||
|  |                 cmp::min(rhs - lhs, MAX_DISTANCE) | ||||||
|  |             } else { | ||||||
|  |                 cmp::min(lhs - rhs, MAX_DISTANCE) + 1 | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { | ||||||
|  |             if lhs.attribute != rhs.attribute { MAX_DISTANCE } | ||||||
|  |             else { index_proximity(lhs.word_index, rhs.word_index) } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { | ||||||
|  |             let mut min_prox = u16::max_value(); | ||||||
|  |             for a in lhs { | ||||||
|  |                 for b in rhs { | ||||||
|  |                     let prox = attribute_proximity(*a, *b); | ||||||
|  |                     min_prox = cmp::min(min_prox, prox); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             min_prox | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         fn matches_proximity(matches: &[SimpleMatch],) -> u16 { | ||||||
|  |             let mut proximity = 0; | ||||||
|  |             let mut iter = matches.linear_group_by_key(|m| m.query_index); | ||||||
|  |  | ||||||
|  |             // iterate over groups by windows of size 2 | ||||||
|  |             let mut last = iter.next(); | ||||||
|  |             while let (Some(lhs), Some(rhs)) = (last, iter.next()) { | ||||||
|  |                 proximity += min_proximity(lhs, rhs); | ||||||
|  |                 last = Some(rhs); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             proximity | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let lhs = matches_proximity(&lhs.processed_matches); | ||||||
|  |         let rhs = matches_proximity(&rhs.processed_matches); | ||||||
|  |  | ||||||
|  |         lhs.cmp(&rhs) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -2,9 +2,13 @@ use std::cmp::Ordering; | |||||||
| use std::error::Error; | use std::error::Error; | ||||||
| use std::fmt; | use std::fmt; | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  | use meilisearch_schema::{Schema, SchemaAttr}; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||||
| use crate::criterion::Criterion; | use crate::criterion::Criterion; | ||||||
| use crate::{RankedMap, RawDocument}; | use crate::{RankedMap, RawDocument}; | ||||||
| use meilisearch_schema::{Schema, SchemaAttr}; |  | ||||||
|  |  | ||||||
| /// A helper struct that permits sorting documents by | /// A helper struct that permits sorting documents by | ||||||
| /// some of their stored attributes. | /// some of their stored attributes. | ||||||
| @@ -28,11 +32,11 @@ use meilisearch_schema::{Schema, SchemaAttr}; | |||||||
| /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; | /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; | ||||||
| /// | /// | ||||||
| /// let builder = CriteriaBuilder::with_capacity(8) | /// let builder = CriteriaBuilder::with_capacity(8) | ||||||
| ///        .add(SumOfTypos) | ///        .add(Typo) | ||||||
| ///        .add(NumberOfWords) | ///        .add(Words) | ||||||
| ///        .add(WordsProximity) | ///        .add(Proximity) | ||||||
| ///        .add(SumOfWordsAttribute) | ///        .add(Attribute) | ||||||
| ///        .add(SumOfWordsPosition) | ///        .add(WordsPosition) | ||||||
| ///        .add(Exact) | ///        .add(Exact) | ||||||
| ///        .add(custom_ranking) | ///        .add(custom_ranking) | ||||||
| ///        .add(DocumentId); | ///        .add(DocumentId); | ||||||
| @@ -86,8 +90,28 @@ impl<'a> SortByAttr<'a> { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> Criterion for SortByAttr<'a> { | impl Criterion for SortByAttr<'_> { | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |     fn name(&self) -> &str { | ||||||
|  |         "sort by attribute" | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         // ... | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn evaluate<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument<'a, 'tag>, | ||||||
|  |         rhs: &RawDocument<'a, 'tag>, | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|         let lhs = self.ranked_map.get(lhs.id, self.attr); |         let lhs = self.ranked_map.get(lhs.id, self.attr); | ||||||
|         let rhs = self.ranked_map.get(rhs.id, self.attr); |         let rhs = self.ranked_map.get(rhs.id, self.attr); | ||||||
|  |  | ||||||
| @@ -105,10 +129,6 @@ impl<'a> Criterion for SortByAttr<'a> { | |||||||
|             (None, None) => Ordering::Equal, |             (None, None) => Ordering::Equal, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "SortByAttr" |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | ||||||
|   | |||||||
| @@ -1,116 +0,0 @@ | |||||||
| use std::cmp::Ordering; |  | ||||||
|  |  | ||||||
| use slice_group_by::GroupBy; |  | ||||||
|  |  | ||||||
| use crate::criterion::Criterion; |  | ||||||
| use crate::RawDocument; |  | ||||||
|  |  | ||||||
| // This function is a wrong logarithmic 10 function. |  | ||||||
| // It is safe to panic on input number higher than 3, |  | ||||||
| // the number of typos is never bigger than that. |  | ||||||
| #[inline] |  | ||||||
| fn custom_log10(n: u8) -> f32 { |  | ||||||
|     match n { |  | ||||||
|         0 => 0.0,     // log(1) |  | ||||||
|         1 => 0.30102, // log(2) |  | ||||||
|         2 => 0.47712, // log(3) |  | ||||||
|         3 => 0.60205, // log(4) |  | ||||||
|         _ => panic!("invalid number"), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[inline] |  | ||||||
| fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { |  | ||||||
|     let mut number_words: usize = 0; |  | ||||||
|     let mut sum_typos = 0.0; |  | ||||||
|     let mut index = 0; |  | ||||||
|  |  | ||||||
|     for group in query_index.linear_group() { |  | ||||||
|         sum_typos += custom_log10(distance[index]); |  | ||||||
|         number_words += 1; |  | ||||||
|         index += group.len(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| pub struct SumOfTypos; |  | ||||||
|  |  | ||||||
| impl Criterion for SumOfTypos { |  | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |  | ||||||
|         let lhs = { |  | ||||||
|             let query_index = lhs.query_index(); |  | ||||||
|             let distance = lhs.distance(); |  | ||||||
|             sum_matches_typos(query_index, distance) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let rhs = { |  | ||||||
|             let query_index = rhs.query_index(); |  | ||||||
|             let distance = rhs.distance(); |  | ||||||
|             sum_matches_typos(query_index, distance) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs).reverse() |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "SumOfTypos" |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     // typing: "Geox CEO" |  | ||||||
|     // |  | ||||||
|     // doc0: "Geox SpA: CEO and Executive" |  | ||||||
|     // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" |  | ||||||
|     #[test] |  | ||||||
|     fn one_typo_reference() { |  | ||||||
|         let query_index0 = &[0, 1]; |  | ||||||
|         let distance0 = &[0, 0]; |  | ||||||
|  |  | ||||||
|         let query_index1 = &[0, 1]; |  | ||||||
|         let distance1 = &[1, 0]; |  | ||||||
|  |  | ||||||
|         let doc0 = sum_matches_typos(query_index0, distance0); |  | ||||||
|         let doc1 = sum_matches_typos(query_index1, distance1); |  | ||||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // typing: "bouton manchette" |  | ||||||
|     // |  | ||||||
|     // doc0: "bouton manchette" |  | ||||||
|     // doc1: "bouton" |  | ||||||
|     #[test] |  | ||||||
|     fn no_typo() { |  | ||||||
|         let query_index0 = &[0, 1]; |  | ||||||
|         let distance0 = &[0, 0]; |  | ||||||
|  |  | ||||||
|         let query_index1 = &[0]; |  | ||||||
|         let distance1 = &[0]; |  | ||||||
|  |  | ||||||
|         let doc0 = sum_matches_typos(query_index0, distance0); |  | ||||||
|         let doc1 = sum_matches_typos(query_index1, distance1); |  | ||||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // typing: "bouton manchztte" |  | ||||||
|     // |  | ||||||
|     // doc0: "bouton manchette" |  | ||||||
|     // doc1: "bouton" |  | ||||||
|     #[test] |  | ||||||
|     fn one_typo() { |  | ||||||
|         let query_index0 = &[0, 1]; |  | ||||||
|         let distance0 = &[0, 1]; |  | ||||||
|  |  | ||||||
|         let query_index1 = &[0]; |  | ||||||
|         let distance1 = &[0]; |  | ||||||
|  |  | ||||||
|         let doc0 = sum_matches_typos(query_index0, distance0); |  | ||||||
|         let doc1 = sum_matches_typos(query_index1, distance1); |  | ||||||
|         assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,64 +0,0 @@ | |||||||
| use crate::criterion::Criterion; |  | ||||||
| use crate::RawDocument; |  | ||||||
| use slice_group_by::GroupBy; |  | ||||||
| use std::cmp::Ordering; |  | ||||||
|  |  | ||||||
| #[inline] |  | ||||||
| fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { |  | ||||||
|     let mut sum_attributes = 0; |  | ||||||
|     let mut index = 0; |  | ||||||
|  |  | ||||||
|     for group in query_index.linear_group() { |  | ||||||
|         sum_attributes += attribute[index] as usize; |  | ||||||
|         index += group.len(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     sum_attributes |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| pub struct SumOfWordsAttribute; |  | ||||||
|  |  | ||||||
| impl Criterion for SumOfWordsAttribute { |  | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |  | ||||||
|         let lhs = { |  | ||||||
|             let query_index = lhs.query_index(); |  | ||||||
|             let attribute = lhs.attribute(); |  | ||||||
|             sum_matches_attributes(query_index, attribute) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let rhs = { |  | ||||||
|             let query_index = rhs.query_index(); |  | ||||||
|             let attribute = rhs.attribute(); |  | ||||||
|             sum_matches_attributes(query_index, attribute) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "SumOfWordsAttribute" |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     // typing: "soulier" |  | ||||||
|     // |  | ||||||
|     // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } |  | ||||||
|     // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" } |  | ||||||
|     #[test] |  | ||||||
|     fn title_vs_description() { |  | ||||||
|         let query_index0 = &[0]; |  | ||||||
|         let attribute0 = &[0]; |  | ||||||
|  |  | ||||||
|         let query_index1 = &[0]; |  | ||||||
|         let attribute1 = &[1]; |  | ||||||
|  |  | ||||||
|         let doc0 = sum_matches_attributes(query_index0, attribute0); |  | ||||||
|         let doc1 = sum_matches_attributes(query_index1, attribute1); |  | ||||||
|         assert_eq!(doc0.cmp(&doc1), Ordering::Less); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,64 +0,0 @@ | |||||||
| use crate::criterion::Criterion; |  | ||||||
| use crate::RawDocument; |  | ||||||
| use slice_group_by::GroupBy; |  | ||||||
| use std::cmp::Ordering; |  | ||||||
|  |  | ||||||
| #[inline] |  | ||||||
| fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { |  | ||||||
|     let mut sum_word_index = 0; |  | ||||||
|     let mut index = 0; |  | ||||||
|  |  | ||||||
|     for group in query_index.linear_group() { |  | ||||||
|         sum_word_index += word_index[index] as usize; |  | ||||||
|         index += group.len(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     sum_word_index |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| pub struct SumOfWordsPosition; |  | ||||||
|  |  | ||||||
| impl Criterion for SumOfWordsPosition { |  | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |  | ||||||
|         let lhs = { |  | ||||||
|             let query_index = lhs.query_index(); |  | ||||||
|             let word_index = lhs.word_index(); |  | ||||||
|             sum_matches_attribute_index(query_index, word_index) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let rhs = { |  | ||||||
|             let query_index = rhs.query_index(); |  | ||||||
|             let word_index = rhs.word_index(); |  | ||||||
|             sum_matches_attribute_index(query_index, word_index) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "SumOfWordsPosition" |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     // typing: "soulier" |  | ||||||
|     // |  | ||||||
|     // doc0: "Soulier bleu" |  | ||||||
|     // doc1: "Botte rouge et soulier noir" |  | ||||||
|     #[test] |  | ||||||
|     fn easy_case() { |  | ||||||
|         let query_index0 = &[0]; |  | ||||||
|         let word_index0 = &[0]; |  | ||||||
|  |  | ||||||
|         let query_index1 = &[0]; |  | ||||||
|         let word_index1 = &[3]; |  | ||||||
|  |  | ||||||
|         let doc0 = sum_matches_attribute_index(query_index0, word_index0); |  | ||||||
|         let doc1 = sum_matches_attribute_index(query_index1, word_index1); |  | ||||||
|         assert_eq!(doc0.cmp(&doc1), Ordering::Less); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
							
								
								
									
										67
									
								
								meilisearch-core/src/criterion/typo.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								meilisearch-core/src/criterion/typo.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | |||||||
|  | use std::cmp::Ordering; | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  |  | ||||||
|  | use super::{Criterion, prepare_query_distances}; | ||||||
|  |  | ||||||
|  | pub struct Typo; | ||||||
|  |  | ||||||
|  | impl Criterion for Typo { | ||||||
|  |     fn name(&self) -> &str { "typo" } | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn evaluate( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument, | ||||||
|  |         rhs: &RawDocument, | ||||||
|  |         postings_lists: &SmallArena<PostingsListView>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         // This is not a true base-10 logarithm: it returns log10(n + 1). | ||||||
|  |         // It is safe to panic on input numbers higher than 3, since | ||||||
|  |         // the number of typos is never bigger than that. | ||||||
|  |         #[inline] | ||||||
|  |         fn custom_log10(n: u8) -> f32 { | ||||||
|  |             match n { | ||||||
|  |                 0 => 0.0,     // log(1) | ||||||
|  |                 1 => 0.30102, // log(2) | ||||||
|  |                 2 => 0.47712, // log(3) | ||||||
|  |                 3 => 0.60205, // log(4) | ||||||
|  |                 _ => panic!("invalid number"), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         #[inline] | ||||||
|  |         fn compute_typos(distances: &[Option<u8>]) -> usize { | ||||||
|  |             let mut number_words: usize = 0; | ||||||
|  |             let mut sum_typos = 0.0; | ||||||
|  |  | ||||||
|  |             for distance in distances { | ||||||
|  |                 if let Some(distance) = distance { | ||||||
|  |                     sum_typos += custom_log10(*distance); | ||||||
|  |                     number_words += 1; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let lhs = compute_typos(&lhs.processed_distances); | ||||||
|  |         let rhs = compute_typos(&rhs.processed_distances); | ||||||
|  |  | ||||||
|  |         lhs.cmp(&rhs).reverse() | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										43
									
								
								meilisearch-core/src/criterion/words.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								meilisearch-core/src/criterion/words.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | |||||||
|  | use std::cmp::Ordering; | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  |  | ||||||
|  | use super::{Criterion, prepare_query_distances}; | ||||||
|  |  | ||||||
|  | pub struct Words; | ||||||
|  |  | ||||||
|  | impl Criterion for Words { | ||||||
|  |     fn name(&self) -> &str { "words" } | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn evaluate( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument, | ||||||
|  |         rhs: &RawDocument, | ||||||
|  |         postings_lists: &SmallArena<PostingsListView>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         #[inline] | ||||||
|  |         fn number_of_query_words(distances: &[Option<u8>]) -> usize { | ||||||
|  |             distances.iter().cloned().filter(Option::is_some).count() | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let lhs = number_of_query_words(&lhs.processed_distances); | ||||||
|  |         let rhs = number_of_query_words(&rhs.processed_distances); | ||||||
|  |  | ||||||
|  |         lhs.cmp(&rhs).reverse() | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										48
									
								
								meilisearch-core/src/criterion/words_position.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								meilisearch-core/src/criterion/words_position.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | |||||||
|  | use std::cmp::Ordering; | ||||||
|  |  | ||||||
|  | use compact_arena::SmallArena; | ||||||
|  | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
|  | use crate::automaton::QueryEnhancer; | ||||||
|  | use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; | ||||||
|  | use crate::RawDocument; | ||||||
|  |  | ||||||
|  | use super::{Criterion, prepare_raw_matches}; | ||||||
|  |  | ||||||
|  | pub struct WordsPosition; | ||||||
|  |  | ||||||
|  | impl Criterion for WordsPosition { | ||||||
|  |     fn name(&self) -> &str { "words position" } | ||||||
|  |  | ||||||
|  |     fn prepare<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|  |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |         query_enhancer: &QueryEnhancer, | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |     ) { | ||||||
|  |         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn evaluate<'a, 'tag, 'txn>( | ||||||
|  |         &self, | ||||||
|  |         lhs: &RawDocument<'a, 'tag>, | ||||||
|  |         rhs: &RawDocument<'a, 'tag>, | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     ) -> Ordering | ||||||
|  |     { | ||||||
|  |         #[inline] | ||||||
|  |         fn sum_words_position(matches: &[SimpleMatch]) -> usize { | ||||||
|  |             let mut sum_words_position = 0; | ||||||
|  |             for group in matches.linear_group_by_key(|bm| bm.query_index) { | ||||||
|  |                 sum_words_position += group[0].word_index as usize; | ||||||
|  |             } | ||||||
|  |             sum_words_position | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let lhs = sum_words_position(&lhs.processed_matches); | ||||||
|  |         let rhs = sum_words_position(&rhs.processed_matches); | ||||||
|  |  | ||||||
|  |         lhs.cmp(&rhs) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,164 +0,0 @@ | |||||||
| use crate::criterion::Criterion; |  | ||||||
| use crate::RawDocument; |  | ||||||
| use slice_group_by::GroupBy; |  | ||||||
| use std::cmp::{self, Ordering}; |  | ||||||
|  |  | ||||||
| const MAX_DISTANCE: u16 = 8; |  | ||||||
|  |  | ||||||
| #[inline] |  | ||||||
| fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) { |  | ||||||
|     (a.clone(), b.clone()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn index_proximity(lhs: u16, rhs: u16) -> u16 { |  | ||||||
|     if lhs < rhs { |  | ||||||
|         cmp::min(rhs - lhs, MAX_DISTANCE) |  | ||||||
|     } else { |  | ||||||
|         cmp::min(lhs - rhs, MAX_DISTANCE) + 1 |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { |  | ||||||
|     if lattr != rattr { |  | ||||||
|         return MAX_DISTANCE; |  | ||||||
|     } |  | ||||||
|     index_proximity(lwi, rwi) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { |  | ||||||
|     let mut min_prox = u16::max_value(); |  | ||||||
|  |  | ||||||
|     for a in lattr.iter().zip(lwi) { |  | ||||||
|         for b in rattr.iter().zip(rwi) { |  | ||||||
|             let a = clone_tuple(a); |  | ||||||
|             let b = clone_tuple(b); |  | ||||||
|             min_prox = cmp::min(min_prox, attribute_proximity(a, b)); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     min_prox |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn matches_proximity( |  | ||||||
|     query_index: &[u32], |  | ||||||
|     distance: &[u8], |  | ||||||
|     attribute: &[u16], |  | ||||||
|     word_index: &[u16], |  | ||||||
| ) -> u16 { |  | ||||||
|     let mut query_index_groups = query_index.linear_group(); |  | ||||||
|     let mut proximity = 0; |  | ||||||
|     let mut index = 0; |  | ||||||
|  |  | ||||||
|     let get_attr_wi = |index: usize, group_len: usize| { |  | ||||||
|         // retrieve the first distance group (with the lowest values) |  | ||||||
|         let len = distance[index..index + group_len] |  | ||||||
|             .linear_group() |  | ||||||
|             .next() |  | ||||||
|             .unwrap() |  | ||||||
|             .len(); |  | ||||||
|  |  | ||||||
|         let rattr = &attribute[index..index + len]; |  | ||||||
|         let rwi = &word_index[index..index + len]; |  | ||||||
|  |  | ||||||
|         (rattr, rwi) |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     let mut last = query_index_groups.next().map(|group| { |  | ||||||
|         let attr_wi = get_attr_wi(index, group.len()); |  | ||||||
|         index += group.len(); |  | ||||||
|         attr_wi |  | ||||||
|     }); |  | ||||||
|  |  | ||||||
|     // iter by windows of size 2 |  | ||||||
|     while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { |  | ||||||
|         let attr_wi = get_attr_wi(index, rhs.len()); |  | ||||||
|         proximity += min_proximity(lhs, attr_wi); |  | ||||||
|         last = Some(attr_wi); |  | ||||||
|         index += rhs.len(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     proximity |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| pub struct WordsProximity; |  | ||||||
|  |  | ||||||
| impl Criterion for WordsProximity { |  | ||||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { |  | ||||||
|         let lhs = { |  | ||||||
|             let query_index = lhs.query_index(); |  | ||||||
|             let distance = lhs.distance(); |  | ||||||
|             let attribute = lhs.attribute(); |  | ||||||
|             let word_index = lhs.word_index(); |  | ||||||
|             matches_proximity(query_index, distance, attribute, word_index) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let rhs = { |  | ||||||
|             let query_index = rhs.query_index(); |  | ||||||
|             let distance = rhs.distance(); |  | ||||||
|             let attribute = rhs.attribute(); |  | ||||||
|             let word_index = rhs.word_index(); |  | ||||||
|             matches_proximity(query_index, distance, attribute, word_index) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         "WordsProximity" |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Query words spread over several attributes: a pair of consecutive
    /// words that share no attribute costs 8.
    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        let proximity = matches_proximity(query_index, distance, attribute, word_index);

        // soup -> of = 8, of -> the = 1, the -> day = 8 (not 1)
        assert_eq!(proximity, 8 + 1 + 8);
    }

    /// Every query word is found consecutively in attribute 1, so each
    /// step costs 1.
    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        let proximity = matches_proximity(query_index, distance, attribute, word_index);

        // soup -> of = 1, of -> the = 1, the -> day = 1
        assert_eq!(proximity, 1 + 1 + 1);
    }
}
| @@ -1,514 +0,0 @@ | |||||||
| use std::cmp::{self, Ordering, Reverse}; |  | ||||||
| use std::borrow::Cow; |  | ||||||
| use std::sync::atomic::{self, AtomicUsize}; |  | ||||||
|  |  | ||||||
| use slice_group_by::{GroupBy, GroupByMut}; |  | ||||||
| use compact_arena::SmallArena; |  | ||||||
| use sdset::{Set, SetBuf}; |  | ||||||
| use log::debug; |  | ||||||
|  |  | ||||||
| use crate::{DocIndex, DocumentId}; |  | ||||||
| use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton}; |  | ||||||
| use crate::automaton::QueryEnhancer; |  | ||||||
|  |  | ||||||
| type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; |  | ||||||
|  |  | ||||||
| pub trait Criterion { |  | ||||||
|     fn name(&self) -> &str; |  | ||||||
|  |  | ||||||
|     fn prepare<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ); |  | ||||||
|  |  | ||||||
|     fn evaluate<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument<'a, 'tag>, |  | ||||||
|         rhs: &RawDocument<'a, 'tag>, |  | ||||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, |  | ||||||
|     ) -> Ordering; |  | ||||||
|  |  | ||||||
|     #[inline] |  | ||||||
|     fn eq<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument<'a, 'tag>, |  | ||||||
|         rhs: &RawDocument<'a, 'tag>, |  | ||||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, |  | ||||||
|     ) -> bool |  | ||||||
|     { |  | ||||||
|         self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn prepare_query_distances<'a, 'tag, 'txn>( |  | ||||||
|     documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|     query_enhancer: &QueryEnhancer, |  | ||||||
|     automatons: &[QueryWordAutomaton], |  | ||||||
|     postings_lists: &PostingsListsArena<'tag, 'txn>, |  | ||||||
| ) { |  | ||||||
|     for document in documents { |  | ||||||
|         if !document.processed_distances.is_empty() { continue } |  | ||||||
|  |  | ||||||
|         let mut processed = Vec::new(); |  | ||||||
|         for m in document.raw_matches.iter() { |  | ||||||
|             if postings_lists[m.postings_list].is_empty() { continue } |  | ||||||
|  |  | ||||||
|             let range = query_enhancer.replacement(m.query_index as u32); |  | ||||||
|             let new_len = cmp::max(range.end as usize, processed.len()); |  | ||||||
|             processed.resize(new_len, None); |  | ||||||
|  |  | ||||||
|             for index in range { |  | ||||||
|                 let index = index as usize; |  | ||||||
|                 processed[index] = match processed[index] { |  | ||||||
|                     Some(distance) if distance > m.distance => Some(m.distance), |  | ||||||
|                     Some(distance) => Some(distance), |  | ||||||
|                     None => Some(m.distance), |  | ||||||
|                 }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         document.processed_distances = processed; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Typo; |  | ||||||
|  |  | ||||||
| impl Criterion for Typo { |  | ||||||
|     fn name(&self) -> &str { "typo" } |  | ||||||
|  |  | ||||||
|     fn prepare<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument, |  | ||||||
|         rhs: &RawDocument, |  | ||||||
|         postings_lists: &PostingsListsArena, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         // This function is a wrong logarithmic 10 function. |  | ||||||
|         // It is safe to panic on input number higher than 3, |  | ||||||
|         // the number of typos is never bigger than that. |  | ||||||
|         #[inline] |  | ||||||
|         fn custom_log10(n: u8) -> f32 { |  | ||||||
|             match n { |  | ||||||
|                 0 => 0.0,     // log(1) |  | ||||||
|                 1 => 0.30102, // log(2) |  | ||||||
|                 2 => 0.47712, // log(3) |  | ||||||
|                 3 => 0.60205, // log(4) |  | ||||||
|                 _ => panic!("invalid number"), |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         #[inline] |  | ||||||
|         fn compute_typos(distances: &[Option<u8>]) -> usize { |  | ||||||
|             let mut number_words: usize = 0; |  | ||||||
|             let mut sum_typos = 0.0; |  | ||||||
|  |  | ||||||
|             for distance in distances { |  | ||||||
|                 if let Some(distance) = distance { |  | ||||||
|                     sum_typos += custom_log10(*distance); |  | ||||||
|                     number_words += 1; |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let lhs = compute_typos(&lhs.processed_distances); |  | ||||||
|         let rhs = compute_typos(&rhs.processed_distances); |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs).reverse() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Words; |  | ||||||
|  |  | ||||||
| impl Criterion for Words { |  | ||||||
|     fn name(&self) -> &str { "words" } |  | ||||||
|  |  | ||||||
|     fn prepare<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument, |  | ||||||
|         rhs: &RawDocument, |  | ||||||
|         postings_lists: &PostingsListsArena, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         #[inline] |  | ||||||
|         fn number_of_query_words(distances: &[Option<u8>]) -> usize { |  | ||||||
|             distances.iter().cloned().filter(Option::is_some).count() |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let lhs = number_of_query_words(&lhs.processed_distances); |  | ||||||
|         let rhs = number_of_query_words(&rhs.processed_distances); |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs).reverse() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn prepare_raw_matches<'a, 'tag, 'txn>( |  | ||||||
|     documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|     postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|     query_enhancer: &QueryEnhancer, |  | ||||||
|     automatons: &[QueryWordAutomaton], |  | ||||||
| ) { |  | ||||||
|     for document in documents { |  | ||||||
|         if !document.processed_matches.is_empty() { continue } |  | ||||||
|  |  | ||||||
|         let mut processed = Vec::new(); |  | ||||||
|         for m in document.raw_matches.iter() { |  | ||||||
|             let postings_list = &postings_lists[m.postings_list]; |  | ||||||
|             processed.reserve(postings_list.len()); |  | ||||||
|             for di in postings_list.as_ref() { |  | ||||||
|                 let simple_match = SimpleMatch { |  | ||||||
|                     query_index: m.query_index, |  | ||||||
|                     distance: m.distance, |  | ||||||
|                     attribute: di.attribute, |  | ||||||
|                     word_index: di.word_index, |  | ||||||
|                     is_exact: m.is_exact, |  | ||||||
|                 }; |  | ||||||
|                 processed.push(simple_match); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); |  | ||||||
|         document.processed_matches = processed.into_vec(); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Proximity; |  | ||||||
|  |  | ||||||
| impl Criterion for Proximity { |  | ||||||
|     fn name(&self) -> &str { "proximity" } |  | ||||||
|  |  | ||||||
|     fn prepare<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument<'a, 'tag>, |  | ||||||
|         rhs: &RawDocument<'a, 'tag>, |  | ||||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         const MAX_DISTANCE: u16 = 8; |  | ||||||
|  |  | ||||||
|         fn index_proximity(lhs: u16, rhs: u16) -> u16 { |  | ||||||
|             if lhs < rhs { |  | ||||||
|                 cmp::min(rhs - lhs, MAX_DISTANCE) |  | ||||||
|             } else { |  | ||||||
|                 cmp::min(lhs - rhs, MAX_DISTANCE) + 1 |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { |  | ||||||
|             if lhs.attribute != rhs.attribute { MAX_DISTANCE } |  | ||||||
|             else { index_proximity(lhs.word_index, rhs.word_index) } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { |  | ||||||
|             let mut min_prox = u16::max_value(); |  | ||||||
|             for a in lhs { |  | ||||||
|                 for b in rhs { |  | ||||||
|                     let prox = attribute_proximity(*a, *b); |  | ||||||
|                     min_prox = cmp::min(min_prox, prox); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|             min_prox |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         fn matches_proximity(matches: &[SimpleMatch],) -> u16 { |  | ||||||
|             let mut proximity = 0; |  | ||||||
|             let mut iter = matches.linear_group_by_key(|m| m.query_index); |  | ||||||
|  |  | ||||||
|             // iterate over groups by windows of size 2 |  | ||||||
|             let mut last = iter.next(); |  | ||||||
|             while let (Some(lhs), Some(rhs)) = (last, iter.next()) { |  | ||||||
|                 proximity += min_proximity(lhs, rhs); |  | ||||||
|                 last = Some(rhs); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             proximity |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let lhs = matches_proximity(&lhs.processed_matches); |  | ||||||
|         let rhs = matches_proximity(&rhs.processed_matches); |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Attribute; |  | ||||||
|  |  | ||||||
| impl Criterion for Attribute { |  | ||||||
|     fn name(&self) -> &str { "attribute" } |  | ||||||
|  |  | ||||||
|     fn prepare<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument<'a, 'tag>, |  | ||||||
|         rhs: &RawDocument<'a, 'tag>, |  | ||||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         #[inline] |  | ||||||
|         fn best_attribute(matches: &[SimpleMatch]) -> u16 { |  | ||||||
|             let mut best_attribute = u16::max_value(); |  | ||||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { |  | ||||||
|                 best_attribute = cmp::min(best_attribute, group[0].attribute); |  | ||||||
|             } |  | ||||||
|             best_attribute |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let lhs = best_attribute(&lhs.processed_matches); |  | ||||||
|         let rhs = best_attribute(&rhs.processed_matches); |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct WordsPosition; |  | ||||||
|  |  | ||||||
| impl Criterion for WordsPosition { |  | ||||||
|     fn name(&self) -> &str { "words position" } |  | ||||||
|  |  | ||||||
|     fn prepare<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument<'a, 'tag>], |  | ||||||
|         postings_lists: &mut PostingsListsArena<'tag, 'txn>, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate<'a, 'tag, 'txn>( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument<'a, 'tag>, |  | ||||||
|         rhs: &RawDocument<'a, 'tag>, |  | ||||||
|         postings_lists: &PostingsListsArena<'tag, 'txn>, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         #[inline] |  | ||||||
|         fn sum_words_position(matches: &[SimpleMatch]) -> usize { |  | ||||||
|             let mut sum_words_position = 0; |  | ||||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { |  | ||||||
|                 sum_words_position += group[0].word_index as usize; |  | ||||||
|             } |  | ||||||
|             sum_words_position |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let lhs = sum_words_position(&lhs.processed_matches); |  | ||||||
|         let rhs = sum_words_position(&rhs.processed_matches); |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Exact; |  | ||||||
|  |  | ||||||
| impl Criterion for Exact { |  | ||||||
|     fn name(&self) -> &str { "exact" } |  | ||||||
|  |  | ||||||
|     fn prepare( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument], |  | ||||||
|         postings_lists: &mut PostingsListsArena, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         for document in documents { |  | ||||||
|             document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument, |  | ||||||
|         rhs: &RawDocument, |  | ||||||
|         postings_lists: &PostingsListsArena, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         #[inline] |  | ||||||
|         fn sum_exact_query_words(matches: &[BareMatch]) -> usize { |  | ||||||
|             let mut sum_exact_query_words = 0; |  | ||||||
|  |  | ||||||
|             for group in matches.linear_group_by_key(|bm| bm.query_index) { |  | ||||||
|                 sum_exact_query_words += group[0].is_exact as usize; |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             sum_exact_query_words |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let lhs = sum_exact_query_words(&lhs.raw_matches); |  | ||||||
|         let rhs = sum_exact_query_words(&rhs.raw_matches); |  | ||||||
|  |  | ||||||
|         lhs.cmp(&rhs).reverse() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct StableDocId; |  | ||||||
|  |  | ||||||
| impl Criterion for StableDocId { |  | ||||||
|     fn name(&self) -> &str { "stable document id" } |  | ||||||
|  |  | ||||||
|     fn prepare( |  | ||||||
|         &self, |  | ||||||
|         documents: &mut [RawDocument], |  | ||||||
|         postings_lists: &mut PostingsListsArena, |  | ||||||
|         query_enhancer: &QueryEnhancer, |  | ||||||
|         automatons: &[QueryWordAutomaton], |  | ||||||
|     ) { |  | ||||||
|         // ... |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn evaluate( |  | ||||||
|         &self, |  | ||||||
|         lhs: &RawDocument, |  | ||||||
|         rhs: &RawDocument, |  | ||||||
|         postings_lists: &PostingsListsArena, |  | ||||||
|     ) -> Ordering |  | ||||||
|     { |  | ||||||
|         let lhs = &lhs.raw_matches[0].document_id; |  | ||||||
|         let rhs = &rhs.raw_matches[0].document_id; |  | ||||||
|  |  | ||||||
|         lhs.cmp(rhs) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
/// Rewrites the query indices and word indices of `matches` according to the
/// ranges returned by `query_enhancer.replacement`.
///
/// `matches` is first sorted in place by `(attribute, word_index)`. Every
/// match is then re-emitted with the first query index of its replacement
/// range, and the remaining indices of the range are emitted as extra
/// "padding" matches at the following word positions. Later matches of the
/// same attribute are shifted by the padding accumulated so far.
///
/// The collected matches are returned through `SetBuf::from_dirty`
/// (presumably sorting and deduplicating them; confirm against sdset).
/// The `automatons` parameter is not read by this function.
| pub fn multiword_rewrite_matches( |  | ||||||
|     matches: &mut [SimpleMatch], |  | ||||||
|     query_enhancer: &QueryEnhancer, |  | ||||||
|     automatons: &[QueryWordAutomaton], |  | ||||||
| ) -> SetBuf<SimpleMatch> |  | ||||||
| { |  | ||||||
|     matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); |  | ||||||
|  |  | ||||||
|     let mut padded_matches = Vec::with_capacity(matches.len()); |  | ||||||
|  |  | ||||||
|     // let before_padding = Instant::now(); |  | ||||||
|     // for each attribute of each document |  | ||||||
|     for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { |  | ||||||
|         // padding will only be applied |  | ||||||
|         // to word indices in the same attribute |  | ||||||
|         let mut padding = 0; |  | ||||||
|         let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); |  | ||||||
|  |  | ||||||
|         // for each match at the same position |  | ||||||
|         // in this document attribute |  | ||||||
|         while let Some(same_word_index) = iter.next() { |  | ||||||
|             // find the biggest padding |  | ||||||
|             let mut biggest = 0; |  | ||||||
|             for match_ in same_word_index { |  | ||||||
|                 let mut replacement = query_enhancer.replacement(match_.query_index as u32); |  | ||||||
                // the length is taken before the first index is consumed below
|                 let replacement_len = replacement.len(); |  | ||||||
                // groups located after the current word position, same attribute
|                 let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); |  | ||||||
|  |  | ||||||
                // the match itself takes the first query index of the range
|                 if let Some(query_index) = replacement.next() { |  | ||||||
|                     let word_index = match_.word_index + padding as u16; |  | ||||||
|                     let query_index = query_index as u16; |  | ||||||
|                     let match_ = SimpleMatch { query_index, word_index, ..*match_ }; |  | ||||||
|                     padded_matches.push(match_); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 let mut found = false; |  | ||||||
|  |  | ||||||
|                 // look ahead and if there already is a match |  | ||||||
|                 // corresponding to this padding word, abort the padding |  | ||||||
|                 'padding: for (x, next_group) in nexts.enumerate() { |  | ||||||
                    // `replacement` now holds only the indices after the first one
|                     for (i, query_index) in replacement.clone().enumerate().skip(x) { |  | ||||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; |  | ||||||
|                         let query_index = query_index as u16; |  | ||||||
|                         let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; |  | ||||||
|  |  | ||||||
|                         for nmatch_ in next_group { |  | ||||||
|                             let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); |  | ||||||
                            // NOTE(review): panics if the replacement range is empty
|                             let query_index = rep.next().unwrap() as u16; |  | ||||||
|                             if query_index == padmatch.query_index { |  | ||||||
|                                 if !found { |  | ||||||
|                                     // if we find a corresponding padding for the |  | ||||||
|                                     // first time we must push preceding paddings |  | ||||||
|                                     for (i, query_index) in replacement.clone().enumerate().take(i) |  | ||||||
|                                     { |  | ||||||
|                                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; |  | ||||||
|                                         let query_index = query_index as u16; |  | ||||||
|                                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; |  | ||||||
|                                         padded_matches.push(match_); |  | ||||||
|                                         biggest = biggest.max(i + 1); |  | ||||||
|                                     } |  | ||||||
|                                 } |  | ||||||
|  |  | ||||||
|                                 padded_matches.push(padmatch); |  | ||||||
|                                 found = true; |  | ||||||
                                // move on to the next look-ahead group
|                                 continue 'padding; |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // if we do not find a corresponding padding in the |  | ||||||
|                     // next groups so stop here and pad what was found |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 if !found { |  | ||||||
|                     // if no padding was found in the following matches |  | ||||||
|                     // we must insert the entire padding |  | ||||||
|                     for (i, query_index) in replacement.enumerate() { |  | ||||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; |  | ||||||
|                         let query_index = query_index as u16; |  | ||||||
|                         let match_ = SimpleMatch { query_index, word_index, ..*match_ }; |  | ||||||
|                         padded_matches.push(match_); |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
                    // NOTE(review): underflows if `replacement_len` is 0; confirm ranges are never empty
|                     biggest = biggest.max(replacement_len - 1); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
            // following word positions of this attribute are shifted by the widest padding
|             padding += biggest; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // debug!("padding matches took {:.02?}", before_padding.elapsed()); |  | ||||||
|  |  | ||||||
|     // With this check we can see that the loop above takes something |  | ||||||
|     // like 43% of the search time even when no rewrite is needed. |  | ||||||
|     // assert_eq!(before_matches, padded_matches); |  | ||||||
|  |  | ||||||
|     SetBuf::from_dirty(padded_matches) |  | ||||||
| } |  | ||||||
| @@ -20,7 +20,6 @@ mod update; | |||||||
|  |  | ||||||
| // TODO replace | // TODO replace | ||||||
| mod bucket_sort; | mod bucket_sort; | ||||||
| mod criterion2; |  | ||||||
|  |  | ||||||
| pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; | pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; | ||||||
| pub use self::error::{Error, MResult}; | pub use self::error::{Error, MResult}; | ||||||
| @@ -31,62 +30,13 @@ pub use self::store::Index; | |||||||
| pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; | pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; | ||||||
| pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; | pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; | ||||||
|  |  | ||||||
/// Owned record describing a single match, one field per property.
///
/// The test-only `Document::from_raw` builds one `TmpMatch` per position of
/// a `RawDocument`'s parallel slices of the same names. The derived ordering
/// compares the fields in declaration order.
| #[doc(hidden)] |  | ||||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] |  | ||||||
| pub struct TmpMatch { |  | ||||||
|     pub query_index: u32, |  | ||||||
|     pub distance: u8, |  | ||||||
|     pub attribute: u16, |  | ||||||
|     pub word_index: u16, |  | ||||||
|     pub is_exact: bool, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
| pub struct Document { | pub struct Document { | ||||||
|     pub id: DocumentId, |     pub id: DocumentId, | ||||||
|     pub highlights: Vec<Highlight>, |     pub highlights: Vec<Highlight>, | ||||||
|  |  | ||||||
|     #[cfg(test)] |     // #[cfg(test)] | ||||||
|     pub matches: Vec<TmpMatch>, |     // pub matches: Vec<TmpMatch>, | ||||||
| } |  | ||||||
|  |  | ||||||
| impl Document { |  | ||||||
    /// Builds a `Document` from a `RawDocument`, keeping only its id and
    /// highlights (variant compiled outside of tests).
|     #[cfg(not(test))] |  | ||||||
|     fn from_raw(raw: RawDocument) -> Document { |  | ||||||
|         Document { |  | ||||||
|             id: raw.id, |  | ||||||
|             highlights: raw.highlights, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
    /// Builds a `Document` from a `RawDocument` and additionally collects
    /// its matches as `TmpMatch` values (variant compiled for tests only).
    ///
    /// One `TmpMatch` is created per entry of `raw.query_index()`; the other
    /// accessors are indexed with the same position, which panics if one of
    /// them is shorter.
|     #[cfg(test)] |  | ||||||
|     fn from_raw(raw: RawDocument) -> Document { |  | ||||||
|         let len = raw.query_index().len(); |  | ||||||
|         let mut matches = Vec::with_capacity(len); |  | ||||||
|  |  | ||||||
|         let query_index = raw.query_index(); |  | ||||||
|         let distance = raw.distance(); |  | ||||||
|         let attribute = raw.attribute(); |  | ||||||
|         let word_index = raw.word_index(); |  | ||||||
|         let is_exact = raw.is_exact(); |  | ||||||
|  |  | ||||||
        // gather position `i` of every parallel slice into one record
|         for i in 0..len { |  | ||||||
|             let match_ = TmpMatch { |  | ||||||
|                 query_index: query_index[i], |  | ||||||
|                 distance: distance[i], |  | ||||||
|                 attribute: attribute[i], |  | ||||||
|                 word_index: word_index[i], |  | ||||||
|                 is_exact: is_exact[i], |  | ||||||
|             }; |  | ||||||
|             matches.push(match_); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         Document { |  | ||||||
|             id: raw.id, |  | ||||||
|             matches, |  | ||||||
|             highlights: raw.highlights, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
|   | |||||||
| @@ -1,21 +1,8 @@ | |||||||
| use hashbrown::HashMap; |  | ||||||
| use std::convert::TryFrom; |  | ||||||
| use std::ops::Range; | use std::ops::Range; | ||||||
| use std::rc::Rc; | use std::time::Duration; | ||||||
| use std::time::{Duration, Instant}; |  | ||||||
| use std::{cmp, mem}; |  | ||||||
|  |  | ||||||
| use fst::{IntoStreamer, Streamer}; |  | ||||||
| use log::debug; |  | ||||||
| use sdset::SetBuf; |  | ||||||
| use slice_group_by::{GroupBy, GroupByMut}; |  | ||||||
|  |  | ||||||
| use crate::{bucket_sort::bucket_sort, database::MainT}; | use crate::{bucket_sort::bucket_sort, database::MainT}; | ||||||
| use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer}; | use crate::{criterion::Criteria, Document, DocumentId}; | ||||||
| use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; |  | ||||||
| use crate::levenshtein::prefix_damerau_levenshtein; |  | ||||||
| use crate::raw_document::{raw_documents_from, RawDocument}; |  | ||||||
| use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; |  | ||||||
| use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; | use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; | ||||||
|  |  | ||||||
| pub struct QueryBuilder<'c, 'f, 'd> { | pub struct QueryBuilder<'c, 'f, 'd> { | ||||||
| @@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> { | |||||||
|     synonyms_store: store::Synonyms, |     synonyms_store: store::Synonyms, | ||||||
| } | } | ||||||
|  |  | ||||||
| fn multiword_rewrite_matches( |  | ||||||
|     mut matches: Vec<(DocumentId, TmpMatch)>, |  | ||||||
|     query_enhancer: &QueryEnhancer, |  | ||||||
| ) -> SetBuf<(DocumentId, TmpMatch)> { |  | ||||||
|     let mut padded_matches = Vec::with_capacity(matches.len()); |  | ||||||
|  |  | ||||||
|     let before_sort = Instant::now(); |  | ||||||
|     // we sort the matches by word index to make them rewritable |  | ||||||
|     matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); |  | ||||||
|     debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); |  | ||||||
|  |  | ||||||
|     let before_padding = Instant::now(); |  | ||||||
|     // for each attribute of each document |  | ||||||
|     for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { |  | ||||||
|         // padding will only be applied |  | ||||||
|         // to word indices in the same attribute |  | ||||||
|         let mut padding = 0; |  | ||||||
|         let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); |  | ||||||
|  |  | ||||||
|         // for each match at the same position |  | ||||||
|         // in this document attribute |  | ||||||
|         while let Some(same_word_index) = iter.next() { |  | ||||||
|             // find the biggest padding |  | ||||||
|             let mut biggest = 0; |  | ||||||
|             for (id, match_) in same_word_index { |  | ||||||
|                 let mut replacement = query_enhancer.replacement(match_.query_index); |  | ||||||
|                 let replacement_len = replacement.len(); |  | ||||||
|                 let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); |  | ||||||
|  |  | ||||||
|                 if let Some(query_index) = replacement.next() { |  | ||||||
|                     let word_index = match_.word_index + padding as u16; |  | ||||||
|                     let match_ = TmpMatch { |  | ||||||
|                         query_index, |  | ||||||
|                         word_index, |  | ||||||
|                         ..*match_ |  | ||||||
|                     }; |  | ||||||
|                     padded_matches.push((*id, match_)); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 let mut found = false; |  | ||||||
|  |  | ||||||
|                 // look ahead and if there already is a match |  | ||||||
|                 // corresponding to this padding word, abort the padding |  | ||||||
|                 'padding: for (x, next_group) in nexts.enumerate() { |  | ||||||
|                     for (i, query_index) in replacement.clone().enumerate().skip(x) { |  | ||||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; |  | ||||||
|                         let padmatch = TmpMatch { |  | ||||||
|                             query_index, |  | ||||||
|                             word_index, |  | ||||||
|                             ..*match_ |  | ||||||
|                         }; |  | ||||||
|  |  | ||||||
|                         for (_, nmatch_) in next_group { |  | ||||||
|                             let mut rep = query_enhancer.replacement(nmatch_.query_index); |  | ||||||
|                             let query_index = rep.next().unwrap(); |  | ||||||
|                             if query_index == padmatch.query_index { |  | ||||||
|                                 if !found { |  | ||||||
|                                     // if we find a corresponding padding for the |  | ||||||
|                                     // first time we must push preceding paddings |  | ||||||
|                                     for (i, query_index) in replacement.clone().enumerate().take(i) |  | ||||||
|                                     { |  | ||||||
|                                         let word_index = |  | ||||||
|                                             match_.word_index + padding as u16 + (i + 1) as u16; |  | ||||||
|                                         let match_ = TmpMatch { |  | ||||||
|                                             query_index, |  | ||||||
|                                             word_index, |  | ||||||
|                                             ..*match_ |  | ||||||
|                                         }; |  | ||||||
|                                         padded_matches.push((*id, match_)); |  | ||||||
|                                         biggest = biggest.max(i + 1); |  | ||||||
|                                     } |  | ||||||
|                                 } |  | ||||||
|  |  | ||||||
|                                 padded_matches.push((*id, padmatch)); |  | ||||||
|                                 found = true; |  | ||||||
|                                 continue 'padding; |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // if we do not find a corresponding padding in the |  | ||||||
|                     // next groups so stop here and pad what was found |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 if !found { |  | ||||||
|                     // if no padding was found in the following matches |  | ||||||
|                     // we must insert the entire padding |  | ||||||
|                     for (i, query_index) in replacement.enumerate() { |  | ||||||
|                         let word_index = match_.word_index + padding as u16 + (i + 1) as u16; |  | ||||||
|                         let match_ = TmpMatch { |  | ||||||
|                             query_index, |  | ||||||
|                             word_index, |  | ||||||
|                             ..*match_ |  | ||||||
|                         }; |  | ||||||
|                         padded_matches.push((*id, match_)); |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     biggest = biggest.max(replacement_len - 1); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             padding += biggest; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { |  | ||||||
|         document_matches.sort_unstable(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     debug!("padding matches took {:.02?}", before_padding.elapsed()); |  | ||||||
|  |  | ||||||
|     // With this check we can see that the loop above takes something |  | ||||||
|     // like 43% of the search time even when no rewrite is needed. |  | ||||||
|     // assert_eq!(before_matches, padded_matches); |  | ||||||
|  |  | ||||||
|     SetBuf::new_unchecked(padded_matches) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn fetch_raw_documents( |  | ||||||
|     reader: &heed::RoTxn<MainT>, |  | ||||||
|     automatons_groups: &[AutomatonGroup], |  | ||||||
|     query_enhancer: &QueryEnhancer, |  | ||||||
|     searchables: Option<&ReorderedAttrs>, |  | ||||||
|     main_store: store::Main, |  | ||||||
|     postings_lists_store: store::PostingsLists, |  | ||||||
| ) -> MResult<Vec<RawDocument>> { |  | ||||||
|     let mut matches = Vec::new(); |  | ||||||
|     let mut highlights = Vec::new(); |  | ||||||
|  |  | ||||||
|     let words = match main_store.words_fst(reader)? { |  | ||||||
|         Some(words) => words, |  | ||||||
|         None => return Ok(Vec::new()), |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     let before_automatons_groups_loop = Instant::now(); |  | ||||||
|     let mut doc_indexes_rewrite = Duration::default(); |  | ||||||
|     let mut retrieve_postings_lists = Duration::default(); |  | ||||||
|     let mut stream_reserve = Duration::default(); |  | ||||||
|     let mut covered_area_time = Duration::default(); |  | ||||||
|     let mut eval_time = Duration::default(); |  | ||||||
|  |  | ||||||
|     for group in automatons_groups { |  | ||||||
|         let AutomatonGroup { is_phrase_query, automatons } = group; |  | ||||||
|         let phrase_query_len = automatons.len(); |  | ||||||
|  |  | ||||||
|         let mut tmp_matches = Vec::new(); |  | ||||||
|         for (id, automaton) in automatons.into_iter().enumerate() { |  | ||||||
|             let Automaton { index, is_exact, query_len, query, .. } = automaton; |  | ||||||
|             let dfa = automaton.dfa(); |  | ||||||
|  |  | ||||||
|             let before_stream_loop = Instant::now(); |  | ||||||
|             let mut stream_count = 0; |  | ||||||
|  |  | ||||||
|             let mut stream = words.search(&dfa).into_stream(); |  | ||||||
|             while let Some(input) = stream.next() { |  | ||||||
|                 let before_eval_time = Instant::now(); |  | ||||||
|                 let distance = dfa.eval(input).to_u8(); |  | ||||||
|                 eval_time += before_eval_time.elapsed(); |  | ||||||
|  |  | ||||||
|                 let is_exact = *is_exact && distance == 0 && input.len() == *query_len; |  | ||||||
|  |  | ||||||
|                 stream_count += 1; |  | ||||||
|  |  | ||||||
|                 let before_covered_area = Instant::now(); |  | ||||||
|                 let covered_area = if *query_len > input.len() { |  | ||||||
|                     input.len() |  | ||||||
|                 } else { |  | ||||||
|                     prefix_damerau_levenshtein(query.as_bytes(), input).1 |  | ||||||
|                 }; |  | ||||||
|                 covered_area_time += before_covered_area.elapsed(); |  | ||||||
|  |  | ||||||
|                 let before_retrieve_postings_lists = Instant::now(); |  | ||||||
|                 let doc_indexes = match postings_lists_store.postings_list(reader, input)? { |  | ||||||
|                     Some(doc_indexes) => doc_indexes, |  | ||||||
|                     None => continue, |  | ||||||
|                 }; |  | ||||||
|                 retrieve_postings_lists += before_retrieve_postings_lists.elapsed(); |  | ||||||
|  |  | ||||||
|                 let before_stream_reserve = Instant::now(); |  | ||||||
|                 tmp_matches.reserve(doc_indexes.len()); |  | ||||||
|                 stream_reserve += before_stream_reserve.elapsed(); |  | ||||||
|  |  | ||||||
|                 let before_doc_indexes_rewrite = Instant::now(); |  | ||||||
|                 for di in doc_indexes.as_ref() { |  | ||||||
|                     let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); |  | ||||||
|                     if let Some(attribute) = attribute { |  | ||||||
|                         let match_ = TmpMatch { |  | ||||||
|                             query_index: *index as u32, |  | ||||||
|                             distance, |  | ||||||
|                             attribute, |  | ||||||
|                             word_index: di.word_index, |  | ||||||
|                             is_exact, |  | ||||||
|                         }; |  | ||||||
|  |  | ||||||
|                         let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value()); |  | ||||||
|                         let covered_area = cmp::min(covered_area, di.char_length); |  | ||||||
|  |  | ||||||
|                         let highlight = Highlight { |  | ||||||
|                             attribute: di.attribute, |  | ||||||
|                             char_index: di.char_index, |  | ||||||
|                             char_length: covered_area, |  | ||||||
|                         }; |  | ||||||
|  |  | ||||||
|                         tmp_matches.push((di.document_id, id, match_, highlight)); |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed(); |  | ||||||
|             } |  | ||||||
|             debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if *is_phrase_query { |  | ||||||
|             tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); |  | ||||||
|             for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { |  | ||||||
|                 for window in group.windows(2) { |  | ||||||
|                     let (ida, ia, ma, ha) = window[0]; |  | ||||||
|                     let (idb, ib, mb, hb) = window[1]; |  | ||||||
|  |  | ||||||
|                     debug_assert_eq!(ida, idb); |  | ||||||
|  |  | ||||||
|                     // if matches must follow and actually follows themselves |  | ||||||
|                     if ia + 1 == ib && ma.word_index + 1 == mb.word_index { |  | ||||||
|                         // TODO we must make it work for phrase query longer than 2 |  | ||||||
|                         // if the second match is the last phrase query word |  | ||||||
|                         if ib + 1 == phrase_query_len { |  | ||||||
|                             // insert first match |  | ||||||
|                             matches.push((ida, ma)); |  | ||||||
|                             highlights.push((ida, ha)); |  | ||||||
|  |  | ||||||
|                             // insert second match |  | ||||||
|                             matches.push((idb, mb)); |  | ||||||
|                             highlights.push((idb, hb)); |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } else { |  | ||||||
|             let before_rerewrite = Instant::now(); |  | ||||||
|  |  | ||||||
|             matches.reserve(tmp_matches.len()); |  | ||||||
|             highlights.reserve(tmp_matches.len()); |  | ||||||
|  |  | ||||||
|             for (id, _, match_, highlight) in tmp_matches { |  | ||||||
|                 matches.push((id, match_)); |  | ||||||
|                 highlights.push((id, highlight)); |  | ||||||
|             } |  | ||||||
|             debug!("rerewrite took {:.02?}", before_rerewrite.elapsed()); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); |  | ||||||
|     debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite); |  | ||||||
|     debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists); |  | ||||||
|     debug!("stream reserve took {:.02?}", stream_reserve); |  | ||||||
|     debug!("covered area took {:.02?}", covered_area_time); |  | ||||||
|     debug!("eval value took {:.02?}", eval_time); |  | ||||||
|  |  | ||||||
|     // { |  | ||||||
|     //     let mut cloned = matches.clone(); |  | ||||||
|     //     let before_sort_test = Instant::now(); |  | ||||||
|     //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance)); |  | ||||||
|     //     debug!("sorting test took {:.02?}", before_sort_test.elapsed()); |  | ||||||
|     // } |  | ||||||
|  |  | ||||||
|     let before_multiword_rewrite_matches = Instant::now(); |  | ||||||
|     debug!("number of matches before rewrite {}", matches.len()); |  | ||||||
|     debug!("{:?}", query_enhancer); |  | ||||||
|     let matches = multiword_rewrite_matches(matches, &query_enhancer); |  | ||||||
|     debug!("number of matches after rewrite {}", matches.len()); |  | ||||||
|     debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed()); |  | ||||||
|  |  | ||||||
|     let before_highlight_sorting = Instant::now(); |  | ||||||
|     let highlights = { |  | ||||||
|         highlights.sort_unstable_by_key(|(id, _)| *id); |  | ||||||
|         SetBuf::new_unchecked(highlights) |  | ||||||
|     }; |  | ||||||
|     debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed()); |  | ||||||
|  |  | ||||||
|     let before_raw_documents = Instant::now(); |  | ||||||
|     let raw_documents = raw_documents_from(matches, highlights); |  | ||||||
|     debug!("raw_documents took {:.02?}", before_raw_documents.elapsed()); |  | ||||||
|     debug!("documents to worry about: {}", raw_documents.len()); |  | ||||||
|  |  | ||||||
|     Ok(raw_documents) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | ||||||
|     pub fn new( |     pub fn new( | ||||||
|         main: store::Main, |         main: store::Main, | ||||||
| @@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | |||||||
|                 reader, |                 reader, | ||||||
|                 query, |                 query, | ||||||
|                 range, |                 range, | ||||||
|                 // self.criteria, |                 self.criteria, | ||||||
|                 self.main_store, |                 self.main_store, | ||||||
|                 self.postings_lists_store, |                 self.postings_lists_store, | ||||||
|                 self.documents_fields_counts_store, |                 self.documents_fields_counts_store, | ||||||
|   | |||||||
| @@ -1,183 +1,89 @@ | |||||||
| use std::fmt; | use compact_arena::SmallArena; | ||||||
| use std::sync::Arc; | use itertools::EitherOrBoth; | ||||||
|  |  | ||||||
| use sdset::SetBuf; | use sdset::SetBuf; | ||||||
| use slice_group_by::GroupBy; |  | ||||||
|  |  | ||||||
| use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; | use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; | ||||||
|  |  | ||||||
| #[derive(Clone)] | pub struct RawDocument<'a, 'tag> { | ||||||
| pub struct RawDocument { |     pub id: crate::DocumentId, | ||||||
|     pub id: DocumentId, |     pub raw_matches: &'a mut [BareMatch<'tag>], | ||||||
|     pub matches: SharedMatches, |     pub processed_matches: Vec<SimpleMatch>, | ||||||
|     pub highlights: Vec<Highlight>, |     /// The list of minimum `distance` found | ||||||
|     pub fields_counts: Option<SetBuf<AttrCount>>, |     pub processed_distances: Vec<Option<u8>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl RawDocument { | impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||||
|     pub fn query_index(&self) -> &[u32] { |     pub fn new<'txn>( | ||||||
|         let r = self.matches.range; |         raw_matches: &'a mut [BareMatch<'tag>], | ||||||
|         // it is safe because construction/modifications |         automatons: &[QueryWordAutomaton], | ||||||
|         // can only be done in this module |         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|         unsafe { |     ) -> Option<RawDocument<'a, 'tag>> | ||||||
|             &self |     { | ||||||
|                 .matches |         raw_matches.sort_unstable_by_key(|m| m.query_index); | ||||||
|                 .matches |  | ||||||
|                 .query_index |         let mut previous_word = None; | ||||||
|                 .get_unchecked(r.start..r.end) |         for i in 0..raw_matches.len() { | ||||||
|  |             let a = &raw_matches[i]; | ||||||
|  |             let auta = &automatons[a.query_index as usize]; | ||||||
|  |  | ||||||
|  |             match auta.phrase_query { | ||||||
|  |                 Some((0, _)) => { | ||||||
|  |                     let b = match raw_matches.get(i + 1) { | ||||||
|  |                         Some(b) => b, | ||||||
|  |                         None => { | ||||||
|  |                             postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||||
|  |                             continue; | ||||||
|  |                         } | ||||||
|  |                     }; | ||||||
|  |  | ||||||
|  |                     if a.query_index + 1 != b.query_index { | ||||||
|  |                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||||
|  |                         continue | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     let pla = &postings_lists[a.postings_list]; | ||||||
|  |                     let plb = &postings_lists[b.postings_list]; | ||||||
|  |  | ||||||
|  |                     let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||||
|  |                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) | ||||||
|  |                     }); | ||||||
|  |  | ||||||
|  |                     let mut newa = Vec::new(); | ||||||
|  |                     let mut newb = Vec::new(); | ||||||
|  |  | ||||||
|  |                     for eb in iter { | ||||||
|  |                         if let EitherOrBoth::Both(a, b) = eb { | ||||||
|  |                             newa.push(*a); | ||||||
|  |                             newb.push(*b); | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|     pub fn distance(&self) -> &[u8] { |                     if !newa.is_empty() { | ||||||
|         let r = self.matches.range; |                         previous_word = Some(a.query_index); | ||||||
|         // it is safe because construction/modifications |  | ||||||
|         // can only be done in this module |  | ||||||
|         unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } |  | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|     pub fn attribute(&self) -> &[u16] { |                     postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); | ||||||
|         let r = self.matches.range; |                     postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); | ||||||
|         // it is safe because construction/modifications |                 }, | ||||||
|         // can only be done in this module |                 Some((1, _)) => { | ||||||
|         unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } |                     if previous_word.take() != Some(a.query_index - 1) { | ||||||
|  |                         postings_lists[a.postings_list].rewrite_with(SetBuf::default()); | ||||||
|                     } |                     } | ||||||
|  |                 }, | ||||||
|     pub fn word_index(&self) -> &[u16] { |                 Some((_, _)) => unreachable!(), | ||||||
|         let r = self.matches.range; |                 None => (), | ||||||
|         // it is safe because construction/modifications |  | ||||||
|         // can only be done in this module |  | ||||||
|         unsafe { |  | ||||||
|             &self |  | ||||||
|                 .matches |  | ||||||
|                 .matches |  | ||||||
|                 .word_index |  | ||||||
|                 .get_unchecked(r.start..r.end) |  | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     pub fn is_exact(&self) -> &[bool] { |         if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { | ||||||
|         let r = self.matches.range; |             return None | ||||||
|         // it is safe because construction/modifications |  | ||||||
|         // can only be done in this module |  | ||||||
|         unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl fmt::Debug for RawDocument { |  | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |  | ||||||
|         f.write_str("RawDocument {\r\n")?; |  | ||||||
|         f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; |  | ||||||
|         f.write_fmt(format_args!( |  | ||||||
|             "{:>15}: {:^5?},\r\n", |  | ||||||
|             "query_index", |  | ||||||
|             self.query_index() |  | ||||||
|         ))?; |  | ||||||
|         f.write_fmt(format_args!( |  | ||||||
|             "{:>15}: {:^5?},\r\n", |  | ||||||
|             "distance", |  | ||||||
|             self.distance() |  | ||||||
|         ))?; |  | ||||||
|         f.write_fmt(format_args!( |  | ||||||
|             "{:>15}: {:^5?},\r\n", |  | ||||||
|             "attribute", |  | ||||||
|             self.attribute() |  | ||||||
|         ))?; |  | ||||||
|         f.write_fmt(format_args!( |  | ||||||
|             "{:>15}: {:^5?},\r\n", |  | ||||||
|             "word_index", |  | ||||||
|             self.word_index() |  | ||||||
|         ))?; |  | ||||||
|         f.write_fmt(format_args!( |  | ||||||
|             "{:>15}: {:^5?},\r\n", |  | ||||||
|             "is_exact", |  | ||||||
|             self.is_exact() |  | ||||||
|         ))?; |  | ||||||
|         f.write_str("}")?; |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn raw_documents_from( |  | ||||||
|     matches: SetBuf<(DocumentId, TmpMatch)>, |  | ||||||
|     highlights: SetBuf<(DocumentId, Highlight)> |  | ||||||
| ) -> Vec<RawDocument> { |  | ||||||
|     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); |  | ||||||
|     let mut matches2 = Matches::with_capacity(matches.len()); |  | ||||||
|  |  | ||||||
|     let matches = matches.linear_group_by_key(|(id, _)| *id); |  | ||||||
|     let highlights = highlights.linear_group_by_key(|(id, _)| *id); |  | ||||||
|  |  | ||||||
|     for (mgroup, hgroup) in matches.zip(highlights) { |  | ||||||
|         assert_eq!(mgroup[0].0, hgroup[0].0); |  | ||||||
|  |  | ||||||
|         let document_id = mgroup[0].0; |  | ||||||
|         let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); |  | ||||||
|         let end = start + mgroup.len(); |  | ||||||
|         let highlights = hgroup.iter().map(|(_, h)| *h).collect(); |  | ||||||
|         let fields_counts = None; |  | ||||||
|  |  | ||||||
|         docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); |  | ||||||
|         // TODO we could try to keep both data |  | ||||||
|         //  - the data oriented one and, |  | ||||||
|         //  - the raw one, the one that comes from the arguments of this function |  | ||||||
|         // This way we would be able to only produce data oriented lazily. |  | ||||||
|         // |  | ||||||
|         // For example the default first criterion is `SumOfTypos` |  | ||||||
|         // and just needs the `query_index` and the `distance` fields. |  | ||||||
|         // It would probably be good to avoid wasting time sorting other fields of documents |  | ||||||
|         // that will never ever reach the second criterion. |  | ||||||
|         matches2.extend_from_slice(mgroup); |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     let matches = Arc::new(matches2); |         Some(RawDocument { | ||||||
|     docs_ranges |             id: raw_matches[0].document_id, | ||||||
|         .into_iter() |             raw_matches, | ||||||
|         .map(|(id, range, highlights, fields_counts)| { |             processed_matches: Vec::new(), | ||||||
|             let matches = SharedMatches { range, matches: matches.clone() }; |             processed_distances: Vec::new(), | ||||||
|             RawDocument { id, matches, highlights, fields_counts } |  | ||||||
|         }) |         }) | ||||||
|         .collect() |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug, Copy, Clone)] |  | ||||||
| struct Range { |  | ||||||
|     start: usize, |  | ||||||
|     end: usize, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Clone)] |  | ||||||
| pub struct SharedMatches { |  | ||||||
|     range: Range, |  | ||||||
|     matches: Arc<Matches>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Clone)] |  | ||||||
| struct Matches { |  | ||||||
|     query_index: Vec<u32>, |  | ||||||
|     distance: Vec<u8>, |  | ||||||
|     attribute: Vec<u16>, |  | ||||||
|     word_index: Vec<u16>, |  | ||||||
|     is_exact: Vec<bool>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl Matches { |  | ||||||
|     fn with_capacity(cap: usize) -> Matches { |  | ||||||
|         Matches { |  | ||||||
|             query_index: Vec::with_capacity(cap), |  | ||||||
|             distance: Vec::with_capacity(cap), |  | ||||||
|             attribute: Vec::with_capacity(cap), |  | ||||||
|             word_index: Vec::with_capacity(cap), |  | ||||||
|             is_exact: Vec::with_capacity(cap), |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { |  | ||||||
|         for (_, match_) in matches { |  | ||||||
|             self.query_index.push(match_.query_index); |  | ||||||
|             self.distance.push(match_.distance); |  | ||||||
|             self.attribute.push(match_.attribute); |  | ||||||
|             self.word_index.push(match_.word_index); |  | ||||||
|             self.is_exact.push(match_.is_exact); |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -310,11 +310,11 @@ impl<'a> SearchBuilder<'a> { | |||||||
|             if let Some(ranking_rules_order) = ranking_order { |             if let Some(ranking_rules_order) = ranking_order { | ||||||
|                 for rule in ranking_rules_order { |                 for rule in ranking_rules_order { | ||||||
|                     match rule.as_str() { |                     match rule.as_str() { | ||||||
|                         "_sum_of_typos" => builder.push(SumOfTypos), |                         "_typo" => builder.push(Typo), | ||||||
|                         "_number_of_words" => builder.push(NumberOfWords), |                         "_words" => builder.push(Words), | ||||||
|                         "_word_proximity" => builder.push(WordsProximity), |                         "_proximity" => builder.push(Proximity), | ||||||
|                         "_sum_of_words_attribute" => builder.push(SumOfWordsAttribute), |                         "_attribute" => builder.push(Attribute), | ||||||
|                         "_sum_of_words_position" => builder.push(SumOfWordsPosition), |                         "_words_position" => builder.push(WordsPosition), | ||||||
|                         "_exact" => builder.push(Exact), |                         "_exact" => builder.push(Exact), | ||||||
|                         _ => { |                         _ => { | ||||||
|                             let order = match ranking_rules.get(rule.as_str()) { |                             let order = match ranking_rules.get(rule.as_str()) { | ||||||
| @@ -340,11 +340,11 @@ impl<'a> SearchBuilder<'a> { | |||||||
|                 builder.push(DocumentId); |                 builder.push(DocumentId); | ||||||
|                 return Ok(Some(builder.build())); |                 return Ok(Some(builder.build())); | ||||||
|             } else { |             } else { | ||||||
|                 builder.push(SumOfTypos); |                 builder.push(Typo); | ||||||
|                 builder.push(NumberOfWords); |                 builder.push(Words); | ||||||
|                 builder.push(WordsProximity); |                 builder.push(Proximity); | ||||||
|                 builder.push(SumOfWordsAttribute); |                 builder.push(Attribute); | ||||||
|                 builder.push(SumOfWordsPosition); |                 builder.push(WordsPosition); | ||||||
|                 builder.push(Exact); |                 builder.push(Exact); | ||||||
|                 for (rule, order) in ranking_rules.iter() { |                 for (rule, order) in ranking_rules.iter() { | ||||||
|                     let custom_ranking = match order { |                     let custom_ranking = match order { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user