mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	First probably working phrase query doc filtering
This commit is contained in:
		
							
								
								
									
										1
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -952,6 +952,7 @@ dependencies = [ | |||||||
|  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", |  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", |  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", |  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", |  "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", |  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", |  "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|   | |||||||
| @@ -17,6 +17,7 @@ env_logger = "0.7.0" | |||||||
| fst = { version = "0.3.5", default-features = false } | fst = { version = "0.3.5", default-features = false } | ||||||
| hashbrown = { version = "0.6.0", features = ["serde"] } | hashbrown = { version = "0.6.0", features = ["serde"] } | ||||||
| heed = "0.6.1" | heed = "0.6.1" | ||||||
|  | itertools = "0.8.2" # kill me please | ||||||
| levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } | ||||||
| log = "0.4.8" | log = "0.4.8" | ||||||
| meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" } | meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" } | ||||||
|   | |||||||
| @@ -59,11 +59,9 @@ pub fn bucket_sort<'c>( | |||||||
|     let before_raw_documents_building = Instant::now(); |     let before_raw_documents_building = Instant::now(); | ||||||
|     let mut raw_documents = Vec::new(); |     let mut raw_documents = Vec::new(); | ||||||
|     for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { |     for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { | ||||||
|         raw_documents.push(RawDocument { |         if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) { | ||||||
|             raw_matches, |             raw_documents.push(raw_document); | ||||||
|             processed_matches: Vec::new(), |         } | ||||||
|             processed_distances: Vec::new(), |  | ||||||
|         }); |  | ||||||
|     } |     } | ||||||
|     debug!("creating {} candidates documents took {:.02?}", |     debug!("creating {} candidates documents took {:.02?}", | ||||||
|         raw_documents.len(), |         raw_documents.len(), | ||||||
| @@ -149,6 +147,57 @@ pub struct RawDocument<'a, 'tag> { | |||||||
|     pub processed_distances: Vec<Option<u8>>, |     pub processed_distances: Vec<Option<u8>>, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl<'a, 'tag> RawDocument<'a, 'tag> { | ||||||
|  |     fn new<'txn>( | ||||||
|  |         raw_matches: &'a mut [BareMatch<'tag>], | ||||||
|  |         automatons: &[QueryWordAutomaton], | ||||||
|  |         postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, | ||||||
|  |     ) -> Option<RawDocument<'a, 'tag>> | ||||||
|  |     { | ||||||
|  |         raw_matches.sort_unstable_by_key(|m| m.query_index); | ||||||
|  |  | ||||||
|  |         // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches); | ||||||
|  |  | ||||||
|  |         let mut previous_word = None; | ||||||
|  |         for i in 0..raw_matches.len() { | ||||||
|  |             let a = &raw_matches[i]; | ||||||
|  |             let auta = &automatons[a.query_index as usize]; | ||||||
|  |  | ||||||
|  |             match auta.phrase_query { | ||||||
|  |                 Some((0, _)) => { | ||||||
|  |                     previous_word = Some(a.query_index); | ||||||
|  |                     let b = raw_matches.get(i + 1)?; | ||||||
|  |                     if a.query_index + 1 != b.query_index { | ||||||
|  |                         return None; | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     let pla = &postings_lists[a.postings_list]; | ||||||
|  |                     let plb = &postings_lists[b.postings_list]; | ||||||
|  |  | ||||||
|  |                     let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { | ||||||
|  |                         a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) | ||||||
|  |                     }); | ||||||
|  |  | ||||||
|  |                     if !iter.any(|eb| eb.is_both()) { return None } | ||||||
|  |                 }, | ||||||
|  |                 Some((1, _)) => { | ||||||
|  |                     if previous_word.take() != Some(a.query_index - 1) { | ||||||
|  |                         return None; | ||||||
|  |                     } | ||||||
|  |                 }, | ||||||
|  |                 Some((_, _)) => unreachable!(), | ||||||
|  |                 None => (), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Some(RawDocument { | ||||||
|  |             raw_matches, | ||||||
|  |             processed_matches: Vec::new(), | ||||||
|  |             processed_distances: Vec::new(), | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| pub struct BareMatch<'tag> { | pub struct BareMatch<'tag> { | ||||||
|     pub document_id: DocumentId, |     pub document_id: DocumentId, | ||||||
|     pub query_index: u16, |     pub query_index: u16, | ||||||
| @@ -186,6 +235,15 @@ pub struct PostingsListView<'txn> { | |||||||
|     len: usize, |     len: usize, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl fmt::Debug for PostingsListView<'_> { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||||
|  |         f.debug_struct("PostingsListView") | ||||||
|  |             .field("input", &std::str::from_utf8(&self.input).unwrap()) | ||||||
|  |             .field("postings_list", &self.as_ref()) | ||||||
|  |             .finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl<'txn> PostingsListView<'txn> { | impl<'txn> PostingsListView<'txn> { | ||||||
|     pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> { |     pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> { | ||||||
|         let len = postings_list.len(); |         let len = postings_list.len(); | ||||||
| @@ -275,6 +333,7 @@ fn fetch_matches<'txn, 'tag>( | |||||||
|                 let input = Rc::from(input); |                 let input = Rc::from(input); | ||||||
|                 let postings_list = Rc::new(postings_list); |                 let postings_list = Rc::new(postings_list); | ||||||
|                 let postings_list_view = PostingsListView::new(input, postings_list); |                 let postings_list_view = PostingsListView::new(input, postings_list); | ||||||
|  |  | ||||||
|                 let mut offset = 0; |                 let mut offset = 0; | ||||||
|                 for group in postings_list_view.linear_group_by_key(|di| di.document_id) { |                 for group in postings_list_view.linear_group_by_key(|di| di.document_id) { | ||||||
|  |  | ||||||
| @@ -442,7 +501,7 @@ fn construct_automatons2( | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             if false && n == 1 { |             if true && n == 1 { | ||||||
|                 if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { |                 if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { | ||||||
|                     let mut left_automaton = QueryWordAutomaton::exact(left); |                     let mut left_automaton = QueryWordAutomaton::exact(left); | ||||||
|                     left_automaton.phrase_query = Some((0, 2)); |                     left_automaton.phrase_query = Some((0, 2)); | ||||||
|   | |||||||
| @@ -43,16 +43,42 @@ pub trait Criterion { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn prepare_query_distances( | fn prepare_query_distances<'a, 'tag, 'txn>( | ||||||
|     documents: &mut [RawDocument], |     documents: &mut [RawDocument<'a, 'tag>], | ||||||
|     query_enhancer: &QueryEnhancer, |     query_enhancer: &QueryEnhancer, | ||||||
|     automatons: &[QueryWordAutomaton], |     automatons: &[QueryWordAutomaton], | ||||||
|  |     postings_lists: &PostingsListsArena<'tag, 'txn>, | ||||||
| ) { | ) { | ||||||
|     for document in documents { |     for document in documents { | ||||||
|         if !document.processed_distances.is_empty() { continue } |         if !document.processed_distances.is_empty() { continue } | ||||||
|  |  | ||||||
|  |         // debug!("{:?}", document.raw_matches[0].document_id); | ||||||
|  |  | ||||||
|         let mut processed = Vec::new(); |         let mut processed = Vec::new(); | ||||||
|         for m in document.raw_matches.iter() { |         let mut raw_matches = document.raw_matches.iter().peekable(); | ||||||
|  |         while let Some(m) = raw_matches.next() { | ||||||
|  |  | ||||||
|  |             // let automaton = &automatons[m.query_index as usize]; | ||||||
|  |  | ||||||
|  |             // debug!("{:?} {:?}", m, automaton); | ||||||
|  |             // debug!("{:?}", &postings_lists[m.postings_list]); | ||||||
|  |  | ||||||
|  |             // match automaton.phrase_query { | ||||||
|  |             //     Some((0, len)) => { | ||||||
|  |             //         match raw_matches.peek() { | ||||||
|  |             //             Some(BareMatch { query_index, .. }) => { | ||||||
|  |             //                 if *query_index != m.query_index + 1 { | ||||||
|  |             //                     raw_matches.next(); | ||||||
|  |             //                     continue | ||||||
|  |             //                 } | ||||||
|  |             //             }, | ||||||
|  |             //             None => continue, | ||||||
|  |             //         } | ||||||
|  |             //     }, | ||||||
|  |             //     Some((_, _)) => continue, | ||||||
|  |             //     None => (), | ||||||
|  |             // } | ||||||
|  |  | ||||||
|             // FIXME we really need to take splitted words into account |             // FIXME we really need to take splitted words into account | ||||||
|             //       those must be seen at the same level as the non-splitteds |             //       those must be seen at the same level as the non-splitteds | ||||||
|             // if automatons[m.query_index as usize].phrase_query.is_some() { |             // if automatons[m.query_index as usize].phrase_query.is_some() { | ||||||
| @@ -73,6 +99,8 @@ fn prepare_query_distances( | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         // debug!("{:?}", processed); | ||||||
|  |  | ||||||
|         document.processed_distances = processed; |         document.processed_distances = processed; | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -82,14 +110,14 @@ pub struct Typo; | |||||||
| impl Criterion for Typo { | impl Criterion for Typo { | ||||||
|     fn name(&self) -> &str { "typo" } |     fn name(&self) -> &str { "typo" } | ||||||
|  |  | ||||||
|     fn prepare( |     fn prepare<'a, 'tag, 'txn>( | ||||||
|         &self, |         &self, | ||||||
|         documents: &mut [RawDocument], |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|         postings_lists: &mut PostingsListsArena, |         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||||
|         query_enhancer: &QueryEnhancer, |         query_enhancer: &QueryEnhancer, | ||||||
|         automatons: &[QueryWordAutomaton], |         automatons: &[QueryWordAutomaton], | ||||||
|     ) { |     ) { | ||||||
|         prepare_query_distances(documents, query_enhancer, automatons); |         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn evaluate( |     fn evaluate( | ||||||
| @@ -140,14 +168,14 @@ pub struct Words; | |||||||
| impl Criterion for Words { | impl Criterion for Words { | ||||||
|     fn name(&self) -> &str { "words" } |     fn name(&self) -> &str { "words" } | ||||||
|  |  | ||||||
|     fn prepare( |     fn prepare<'a, 'tag, 'txn>( | ||||||
|         &self, |         &self, | ||||||
|         documents: &mut [RawDocument], |         documents: &mut [RawDocument<'a, 'tag>], | ||||||
|         postings_lists: &mut PostingsListsArena, |         postings_lists: &mut PostingsListsArena<'tag, 'txn>, | ||||||
|         query_enhancer: &QueryEnhancer, |         query_enhancer: &QueryEnhancer, | ||||||
|         automatons: &[QueryWordAutomaton], |         automatons: &[QueryWordAutomaton], | ||||||
|     ) { |     ) { | ||||||
|         prepare_query_distances(documents, query_enhancer, automatons); |         prepare_query_distances(documents, query_enhancer, automatons, postings_lists); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn evaluate( |     fn evaluate( | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user