mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Clean up the fetch algorithm
This commit is contained in:
		| @@ -29,8 +29,13 @@ impl AutomatonProducer { | |||||||
|         postings_list_store: store::PostingsLists, |         postings_list_store: store::PostingsLists, | ||||||
|         synonyms_store: store::Synonyms, |         synonyms_store: store::Synonyms, | ||||||
|     ) -> MResult<(AutomatonProducer, QueryEnhancer)> { |     ) -> MResult<(AutomatonProducer, QueryEnhancer)> { | ||||||
|         let (automatons, query_enhancer) = |         let (automatons, query_enhancer) = generate_automatons( | ||||||
|             generate_automatons(reader, query, main_store, postings_list_store, synonyms_store)?; |             reader, | ||||||
|  |             query, | ||||||
|  |             main_store, | ||||||
|  |             postings_list_store, | ||||||
|  |             synonyms_store, | ||||||
|  |         )?; | ||||||
|  |  | ||||||
|         Ok((AutomatonProducer { automatons }, query_enhancer)) |         Ok((AutomatonProducer { automatons }, query_enhancer)) | ||||||
|     } |     } | ||||||
| @@ -41,9 +46,25 @@ impl AutomatonProducer { | |||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| pub enum AutomatonGroup { | pub struct AutomatonGroup { | ||||||
|     Normal(Vec<Automaton>), |     pub is_phrase_query: bool, | ||||||
|     PhraseQuery(Vec<Automaton>), |     pub automatons: Vec<Automaton>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl AutomatonGroup { | ||||||
|  |     fn normal(automatons: Vec<Automaton>) -> AutomatonGroup { | ||||||
|  |         AutomatonGroup { | ||||||
|  |             is_phrase_query: false, | ||||||
|  |             automatons, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup { | ||||||
|  |         AutomatonGroup { | ||||||
|  |             is_phrase_query: true, | ||||||
|  |             automatons, | ||||||
|  |         } | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| @@ -143,8 +164,7 @@ fn generate_automatons( | |||||||
|     main_store: store::Main, |     main_store: store::Main, | ||||||
|     postings_lists_store: store::PostingsLists, |     postings_lists_store: store::PostingsLists, | ||||||
|     synonym_store: store::Synonyms, |     synonym_store: store::Synonyms, | ||||||
| ) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> | ) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> { | ||||||
| { |  | ||||||
|     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); |     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); | ||||||
|     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); |     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); | ||||||
|     let synonyms = match main_store.synonyms_fst(reader)? { |     let synonyms = match main_store.synonyms_fst(reader)? { | ||||||
| @@ -173,7 +193,7 @@ fn generate_automatons( | |||||||
|         original_automatons.push(automaton); |         original_automatons.push(automaton); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     automatons.push(AutomatonGroup::Normal(original_automatons)); |     automatons.push(AutomatonGroup::normal(original_automatons)); | ||||||
|  |  | ||||||
|     for n in 1..=NGRAMS { |     for n in 1..=NGRAMS { | ||||||
|         let mut ngrams = query_words.windows(n).enumerate().peekable(); |         let mut ngrams = query_words.windows(n).enumerate().peekable(); | ||||||
| @@ -225,14 +245,16 @@ fn generate_automatons( | |||||||
|                                 Automaton::non_exact(automaton_index, n, synonym) |                                 Automaton::non_exact(automaton_index, n, synonym) | ||||||
|                             }; |                             }; | ||||||
|                             automaton_index += 1; |                             automaton_index += 1; | ||||||
|                             automatons.push(AutomatonGroup::Normal(vec![automaton])); |                             automatons.push(AutomatonGroup::normal(vec![automaton])); | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             if n == 1 { |             if n == 1 { | ||||||
|                 if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { |                 if let Some((left, right)) = | ||||||
|  |                     split_best_frequency(reader, &normalized, postings_lists_store)? | ||||||
|  |                 { | ||||||
|                     let a = Automaton::exact(automaton_index, 1, left); |                     let a = Automaton::exact(automaton_index, 1, left); | ||||||
|                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); |                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); | ||||||
|                     automaton_index += 1; |                     automaton_index += 1; | ||||||
| @@ -241,7 +263,7 @@ fn generate_automatons( | |||||||
|                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); |                     enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); | ||||||
|                     automaton_index += 1; |                     automaton_index += 1; | ||||||
|  |  | ||||||
|                     automatons.push(AutomatonGroup::PhraseQuery(vec![a, b])); |                     automatons.push(AutomatonGroup::phrase_query(vec![a, b])); | ||||||
|                 } |                 } | ||||||
|             } else { |             } else { | ||||||
|                 // automaton of concatenation of query words |                 // automaton of concatenation of query words | ||||||
| @@ -253,7 +275,7 @@ fn generate_automatons( | |||||||
|  |  | ||||||
|                 let automaton = Automaton::exact(automaton_index, n, &normalized); |                 let automaton = Automaton::exact(automaton_index, n, &normalized); | ||||||
|                 automaton_index += 1; |                 automaton_index += 1; | ||||||
|                 automatons.push(AutomatonGroup::Normal(vec![automaton])); |                 automatons.push(AutomatonGroup::normal(vec![automaton])); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -261,10 +283,7 @@ fn generate_automatons( | |||||||
|     // order automatons, the most important first, |     // order automatons, the most important first, | ||||||
|     // we keep the original automatons at the front. |     // we keep the original automatons at the front. | ||||||
|     automatons[1..].sort_by_key(|group| { |     automatons[1..].sort_by_key(|group| { | ||||||
|         let a = match group { |         let a = group.automatons.first().unwrap(); | ||||||
|             AutomatonGroup::Normal(group) => group.first().unwrap(), |  | ||||||
|             AutomatonGroup::PhraseQuery(group) => group.first().unwrap(), |  | ||||||
|         }; |  | ||||||
|         (Reverse(a.is_exact), a.ngram) |         (Reverse(a.is_exact), a.ngram) | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -149,128 +149,92 @@ fn fetch_raw_documents( | |||||||
|     let mut highlights = Vec::new(); |     let mut highlights = Vec::new(); | ||||||
|  |  | ||||||
|     for group in automatons_groups { |     for group in automatons_groups { | ||||||
|         match group { |         let AutomatonGroup { | ||||||
|             AutomatonGroup::Normal(automatons) => { |             is_phrase_query, | ||||||
|                 for automaton in automatons { |             automatons, | ||||||
|                     let Automaton { index, is_exact, query_len, .. } = automaton; |         } = group; | ||||||
|                     let dfa = automaton.dfa(); |         let phrase_query_len = automatons.len(); | ||||||
|  |  | ||||||
|                     let words = match main_store.words_fst(reader)? { |         let mut tmp_matches = Vec::new(); | ||||||
|                         Some(words) => words, |         for (id, automaton) in automatons.into_iter().enumerate() { | ||||||
|                         None => return Ok(Vec::new()), |             let Automaton { | ||||||
|                     }; |                 index, | ||||||
|  |                 is_exact, | ||||||
|  |                 query_len, | ||||||
|  |                 .. | ||||||
|  |             } = automaton; | ||||||
|  |             let dfa = automaton.dfa(); | ||||||
|  |  | ||||||
|                     let mut stream = words.search(&dfa).into_stream(); |             let words = match main_store.words_fst(reader)? { | ||||||
|                     while let Some(input) = stream.next() { |                 Some(words) => words, | ||||||
|                         let distance = dfa.eval(input).to_u8(); |                 None => return Ok(Vec::new()), | ||||||
|                         let is_exact = *is_exact && distance == 0 && input.len() == *query_len; |             }; | ||||||
|  |  | ||||||
|                         let doc_indexes = match postings_lists_store.postings_list(reader, input)? { |             let mut stream = words.search(&dfa).into_stream(); | ||||||
|                             Some(doc_indexes) => doc_indexes, |             while let Some(input) = stream.next() { | ||||||
|                             None => continue, |                 let distance = dfa.eval(input).to_u8(); | ||||||
|  |                 let is_exact = *is_exact && distance == 0 && input.len() == *query_len; | ||||||
|  |  | ||||||
|  |                 let doc_indexes = match postings_lists_store.postings_list(reader, input)? { | ||||||
|  |                     Some(doc_indexes) => doc_indexes, | ||||||
|  |                     None => continue, | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|  |                 tmp_matches.reserve(doc_indexes.len()); | ||||||
|  |  | ||||||
|  |                 for di in doc_indexes.as_ref() { | ||||||
|  |                     let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); | ||||||
|  |                     if let Some(attribute) = attribute { | ||||||
|  |                         let match_ = TmpMatch { | ||||||
|  |                             query_index: *index as u32, | ||||||
|  |                             distance, | ||||||
|  |                             attribute, | ||||||
|  |                             word_index: di.word_index, | ||||||
|  |                             is_exact, | ||||||
|                         }; |                         }; | ||||||
|  |  | ||||||
|                         matches.reserve(doc_indexes.len()); |                         let highlight = Highlight { | ||||||
|                         highlights.reserve(doc_indexes.len()); |                             attribute: di.attribute, | ||||||
|  |                             char_index: di.char_index, | ||||||
|                         for di in doc_indexes.as_ref() { |                             char_length: di.char_length, | ||||||
|                             let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); |  | ||||||
|                             if let Some(attribute) = attribute { |  | ||||||
|                                 let match_ = TmpMatch { |  | ||||||
|                                     query_index: *index as u32, |  | ||||||
|                                     distance, |  | ||||||
|                                     attribute, |  | ||||||
|                                     word_index: di.word_index, |  | ||||||
|                                     is_exact, |  | ||||||
|                                 }; |  | ||||||
|  |  | ||||||
|                                 let highlight = Highlight { |  | ||||||
|                                     attribute: di.attribute, |  | ||||||
|                                     char_index: di.char_index, |  | ||||||
|                                     char_length: di.char_length, |  | ||||||
|                                 }; |  | ||||||
|  |  | ||||||
|                                 matches.push((di.document_id, match_)); |  | ||||||
|                                 highlights.push((di.document_id, highlight)); |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             }, |  | ||||||
|             AutomatonGroup::PhraseQuery(automatons) => { |  | ||||||
|                 let mut tmp_matches = Vec::new(); |  | ||||||
|                 let phrase_query_len = automatons.len(); |  | ||||||
|  |  | ||||||
|                 for (id, automaton) in automatons.into_iter().enumerate() { |  | ||||||
|                     let Automaton { index, is_exact, query_len, .. } = automaton; |  | ||||||
|                     let dfa = automaton.dfa(); |  | ||||||
|  |  | ||||||
|                     let words = match main_store.words_fst(reader)? { |  | ||||||
|                         Some(words) => words, |  | ||||||
|                         None => return Ok(Vec::new()), |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     let mut stream = words.search(&dfa).into_stream(); |  | ||||||
|                     while let Some(input) = stream.next() { |  | ||||||
|                         let distance = dfa.eval(input).to_u8(); |  | ||||||
|                         let is_exact = *is_exact && distance == 0 && input.len() == *query_len; |  | ||||||
|  |  | ||||||
|                         let doc_indexes = match postings_lists_store.postings_list(reader, input)? { |  | ||||||
|                             Some(doc_indexes) => doc_indexes, |  | ||||||
|                             None => continue, |  | ||||||
|                         }; |                         }; | ||||||
|  |  | ||||||
|                         tmp_matches.reserve(doc_indexes.len()); |                         tmp_matches.push((di.document_id, id, match_, highlight)); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|                         for di in doc_indexes.as_ref() { |         if *is_phrase_query { | ||||||
|                             let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); |             tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); | ||||||
|                             if let Some(attribute) = attribute { |             for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { | ||||||
|                                 let match_ = TmpMatch { |                 for window in group.windows(2) { | ||||||
|                                     query_index: *index as u32, |                     let (ida, ia, ma, ha) = window[0]; | ||||||
|                                     distance, |                     let (idb, ib, mb, hb) = window[1]; | ||||||
|                                     attribute, |  | ||||||
|                                     word_index: di.word_index, |  | ||||||
|                                     is_exact, |  | ||||||
|                                 }; |  | ||||||
|  |  | ||||||
|                                 let highlight = Highlight { |                     debug_assert_eq!(ida, idb); | ||||||
|                                     attribute: di.attribute, |  | ||||||
|                                     char_index: di.char_index, |  | ||||||
|                                     char_length: di.char_length, |  | ||||||
|                                 }; |  | ||||||
|  |  | ||||||
|                                 tmp_matches.push((di.document_id, id, match_, highlight)); |                     // if matches must follow and actually follows themselves | ||||||
|                             } |                     if ia + 1 == ib && ma.word_index + 1 == mb.word_index { | ||||||
|                         } |                         // TODO we must make it work for phrase query longer than 2 | ||||||
|                     } |                         // if the second match is the last phrase query word | ||||||
|                 } |                         if ib + 1 == phrase_query_len { | ||||||
|  |                             // insert first match | ||||||
|                 tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); |                             matches.push((ida, ma)); | ||||||
|                 for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { |                             highlights.push((ida, ha)); | ||||||
|                     for window in group.windows(2) { |  | ||||||
|                         let (ida, ia, ma, ha) = window[0]; |                             // insert second match | ||||||
|                         let (idb, ib, mb, hb) = window[1]; |                             matches.push((idb, mb)); | ||||||
|  |                             highlights.push((idb, hb)); | ||||||
|                         debug_assert_eq!(ida, idb); |  | ||||||
|  |  | ||||||
|                         // if matches must follow and actually follows themselves |  | ||||||
|                         if ia + 1 == ib && ma.word_index + 1 == mb.word_index { |  | ||||||
|  |  | ||||||
|                             // TODO we must make it work for phrase query longer than 2 |  | ||||||
|                             // if the second match is the last phrase query word |  | ||||||
|                             if ib + 1 == phrase_query_len { |  | ||||||
|                                 // insert first match |  | ||||||
|                                 matches.push((ida, ma)); |  | ||||||
|                                 highlights.push((ida, ha)); |  | ||||||
|  |  | ||||||
|                                 // insert second match |  | ||||||
|                                 matches.push((idb, mb)); |  | ||||||
|                                 highlights.push((idb, hb)); |  | ||||||
|                             } |  | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |         } else { | ||||||
|  |             for (id, _, match_, highlight) in tmp_matches { | ||||||
|  |                 matches.push((id, match_)); | ||||||
|  |                 highlights.push((id, highlight)); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -442,8 +406,13 @@ where | |||||||
|     let start_processing = Instant::now(); |     let start_processing = Instant::now(); | ||||||
|     let mut raw_documents_processed = Vec::with_capacity(range.len()); |     let mut raw_documents_processed = Vec::with_capacity(range.len()); | ||||||
|  |  | ||||||
|     let (automaton_producer, query_enhancer) = |     let (automaton_producer, query_enhancer) = AutomatonProducer::new( | ||||||
|         AutomatonProducer::new(reader, query, main_store, postings_lists_store, synonyms_store)?; |         reader, | ||||||
|  |         query, | ||||||
|  |         main_store, | ||||||
|  |         postings_lists_store, | ||||||
|  |         synonyms_store, | ||||||
|  |     )?; | ||||||
|  |  | ||||||
|     let automaton_producer = automaton_producer.into_iter(); |     let automaton_producer = automaton_producer.into_iter(); | ||||||
|     let mut automatons = Vec::new(); |     let mut automatons = Vec::new(); | ||||||
| @@ -555,8 +524,13 @@ where | |||||||
|     let start_processing = Instant::now(); |     let start_processing = Instant::now(); | ||||||
|     let mut raw_documents_processed = Vec::new(); |     let mut raw_documents_processed = Vec::new(); | ||||||
|  |  | ||||||
|     let (automaton_producer, query_enhancer) = |     let (automaton_producer, query_enhancer) = AutomatonProducer::new( | ||||||
|         AutomatonProducer::new(reader, query, main_store, postings_lists_store, synonyms_store)?; |         reader, | ||||||
|  |         query, | ||||||
|  |         main_store, | ||||||
|  |         postings_lists_store, | ||||||
|  |         synonyms_store, | ||||||
|  |     )?; | ||||||
|  |  | ||||||
|     let automaton_producer = automaton_producer.into_iter(); |     let automaton_producer = automaton_producer.into_iter(); | ||||||
|     let mut automatons = Vec::new(); |     let mut automatons = Vec::new(); | ||||||
| @@ -1778,9 +1752,8 @@ mod tests { | |||||||
|         let store = TempDatabase::from_iter(vec![ |         let store = TempDatabase::from_iter(vec![ | ||||||
|             ("search", &[doc_index(0, 0)][..]), |             ("search", &[doc_index(0, 0)][..]), | ||||||
|             ("engine", &[doc_index(0, 1)][..]), |             ("engine", &[doc_index(0, 1)][..]), | ||||||
|  |  | ||||||
|             ("search", &[doc_index(1, 0)][..]), |             ("search", &[doc_index(1, 0)][..]), | ||||||
|             ("slow",   &[doc_index(1, 1)][..]), |             ("slow", &[doc_index(1, 1)][..]), | ||||||
|             ("engine", &[doc_index(1, 2)][..]), |             ("engine", &[doc_index(1, 2)][..]), | ||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
| @@ -1806,15 +1779,13 @@ mod tests { | |||||||
|             ("search", &[doc_index(0, 0)][..]), |             ("search", &[doc_index(0, 0)][..]), | ||||||
|             ("search", &[doc_index(0, 1)][..]), |             ("search", &[doc_index(0, 1)][..]), | ||||||
|             ("engine", &[doc_index(0, 2)][..]), |             ("engine", &[doc_index(0, 2)][..]), | ||||||
|  |  | ||||||
|             ("search", &[doc_index(1, 0)][..]), |             ("search", &[doc_index(1, 0)][..]), | ||||||
|             ("slow",   &[doc_index(1, 1)][..]), |             ("slow", &[doc_index(1, 1)][..]), | ||||||
|             ("search", &[doc_index(1, 2)][..]), |             ("search", &[doc_index(1, 2)][..]), | ||||||
|             ("engine", &[doc_index(1, 3)][..]), |             ("engine", &[doc_index(1, 3)][..]), | ||||||
|  |  | ||||||
|             ("search", &[doc_index(1, 0)][..]), |             ("search", &[doc_index(1, 0)][..]), | ||||||
|             ("search", &[doc_index(1, 1)][..]), |             ("search", &[doc_index(1, 1)][..]), | ||||||
|             ("slow",   &[doc_index(1, 2)][..]), |             ("slow", &[doc_index(1, 2)][..]), | ||||||
|             ("engine", &[doc_index(1, 3)][..]), |             ("engine", &[doc_index(1, 3)][..]), | ||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user