mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Before improving fields AttrCount
Removing the fields_count fetching reduced the search time by a factor of 2; we should look at lazily pulling them from the criterions that need them. ugly-test: Make the fields_count fetching lazy, just before running the exactness criterion.
This commit is contained in:
		
				
					committed by
					
						 Clément Renault
						Clément Renault
					
				
			
			
				
	
			
			
			
						parent
						
							11f3d7782d
						
					
				
				
					commit
					ef6a4db182
				
			| @@ -2,7 +2,7 @@ mod dfa; | |||||||
| mod query_enhancer; | mod query_enhancer; | ||||||
|  |  | ||||||
| use std::cmp::Reverse; | use std::cmp::Reverse; | ||||||
| use std::{cmp, vec}; | use std::{cmp, fmt, vec}; | ||||||
|  |  | ||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| use levenshtein_automata::DFA; | use levenshtein_automata::DFA; | ||||||
| @@ -68,7 +68,6 @@ impl AutomatonGroup { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug)] |  | ||||||
| pub struct Automaton { | pub struct Automaton { | ||||||
|     pub index: usize, |     pub index: usize, | ||||||
|     pub ngram: usize, |     pub ngram: usize, | ||||||
| @@ -78,6 +77,14 @@ pub struct Automaton { | |||||||
|     pub query: String, |     pub query: String, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl fmt::Debug for Automaton { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||||
|  |         f.debug_struct("Automaton") | ||||||
|  |             .field("query", &self.query) | ||||||
|  |             .finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl Automaton { | impl Automaton { | ||||||
|     pub fn dfa(&self) -> DFA { |     pub fn dfa(&self) -> DFA { | ||||||
|         if self.is_prefix { |         if self.is_prefix { | ||||||
|   | |||||||
| @@ -1,18 +1,17 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
|  |  | ||||||
| use meilisearch_schema::SchemaAttr; |  | ||||||
| use sdset::Set; | use sdset::Set; | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
| use crate::criterion::Criterion; | use crate::criterion::Criterion; | ||||||
| use crate::RawDocument; | use crate::{AttrCount, RawDocument}; | ||||||
|  |  | ||||||
| #[inline] | #[inline] | ||||||
| fn number_exact_matches( | fn number_exact_matches( | ||||||
|     query_index: &[u32], |     query_index: &[u32], | ||||||
|     attribute: &[u16], |     attribute: &[u16], | ||||||
|     is_exact: &[bool], |     is_exact: &[bool], | ||||||
|     fields_counts: &Set<(SchemaAttr, u16)>, |     fields_counts: &Set<AttrCount>, | ||||||
| ) -> usize { | ) -> usize { | ||||||
|     let mut count = 0; |     let mut count = 0; | ||||||
|     let mut index = 0; |     let mut index = 0; | ||||||
| @@ -25,8 +24,8 @@ fn number_exact_matches( | |||||||
|             if *is_exact { |             if *is_exact { | ||||||
|                 found_exact = true; |                 found_exact = true; | ||||||
|                 let attr = &attribute[index + pos]; |                 let attr = &attribute[index + pos]; | ||||||
|                 if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) { |                 if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) { | ||||||
|                     let (_, count) = fields_counts[pos]; |                     let AttrCount { count, .. } = fields_counts[pos]; | ||||||
|                     if count == 1 { |                     if count == 1 { | ||||||
|                         return usize::max_value(); |                         return usize::max_value(); | ||||||
|                     } |                     } | ||||||
| @@ -50,7 +49,7 @@ impl Criterion for Exact { | |||||||
|             let query_index = lhs.query_index(); |             let query_index = lhs.query_index(); | ||||||
|             let is_exact = lhs.is_exact(); |             let is_exact = lhs.is_exact(); | ||||||
|             let attribute = lhs.attribute(); |             let attribute = lhs.attribute(); | ||||||
|             let fields_counts = &lhs.fields_counts; |             let fields_counts = lhs.fields_counts.as_ref().unwrap(); | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||||
|         }; |         }; | ||||||
| @@ -59,7 +58,7 @@ impl Criterion for Exact { | |||||||
|             let query_index = rhs.query_index(); |             let query_index = rhs.query_index(); | ||||||
|             let is_exact = rhs.is_exact(); |             let is_exact = rhs.is_exact(); | ||||||
|             let attribute = rhs.attribute(); |             let attribute = rhs.attribute(); | ||||||
|             let fields_counts = &rhs.fields_counts; |             let fields_counts = rhs.fields_counts.as_ref().unwrap(); | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||||
|         }; |         }; | ||||||
| @@ -86,7 +85,7 @@ mod tests { | |||||||
|             let query_index = &[0]; |             let query_index = &[0]; | ||||||
|             let attribute = &[0]; |             let attribute = &[0]; | ||||||
|             let is_exact = &[true]; |             let is_exact = &[true]; | ||||||
|             let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); |             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||||
|         }; |         }; | ||||||
| @@ -95,7 +94,7 @@ mod tests { | |||||||
|             let query_index = &[0]; |             let query_index = &[0]; | ||||||
|             let attribute = &[0]; |             let attribute = &[0]; | ||||||
|             let is_exact = &[false]; |             let is_exact = &[false]; | ||||||
|             let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); |             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||||
|         }; |         }; | ||||||
| @@ -113,7 +112,7 @@ mod tests { | |||||||
|             let query_index = &[0]; |             let query_index = &[0]; | ||||||
|             let attribute = &[0]; |             let attribute = &[0]; | ||||||
|             let is_exact = &[true]; |             let is_exact = &[true]; | ||||||
|             let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap(); |             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||||
|         }; |         }; | ||||||
| @@ -122,7 +121,7 @@ mod tests { | |||||||
|             let query_index = &[0]; |             let query_index = &[0]; | ||||||
|             let attribute = &[0]; |             let attribute = &[0]; | ||||||
|             let is_exact = &[true]; |             let is_exact = &[true]; | ||||||
|             let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap(); |             let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); | ||||||
|  |  | ||||||
|             number_exact_matches(query_index, attribute, is_exact, fields_counts) |             number_exact_matches(query_index, attribute, is_exact, fields_counts) | ||||||
|         }; |         }; | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ use std::time::{Duration, Instant}; | |||||||
| use std::{cmp, mem}; | use std::{cmp, mem}; | ||||||
|  |  | ||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
|  | use log::debug; | ||||||
| use sdset::SetBuf; | use sdset::SetBuf; | ||||||
| use slice_group_by::{GroupBy, GroupByMut}; | use slice_group_by::{GroupBy, GroupByMut}; | ||||||
|  |  | ||||||
| @@ -14,7 +15,7 @@ use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhanc | |||||||
| use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; | use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; | ||||||
| use crate::levenshtein::prefix_damerau_levenshtein; | use crate::levenshtein::prefix_damerau_levenshtein; | ||||||
| use crate::raw_document::{raw_documents_from, RawDocument}; | use crate::raw_document::{raw_documents_from, RawDocument}; | ||||||
| use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; | use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; | ||||||
| use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; | use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; | ||||||
|  |  | ||||||
| pub struct QueryBuilder<'c, 'f, 'd> { | pub struct QueryBuilder<'c, 'f, 'd> { | ||||||
| @@ -146,27 +147,18 @@ fn fetch_raw_documents( | |||||||
|     searchables: Option<&ReorderedAttrs>, |     searchables: Option<&ReorderedAttrs>, | ||||||
|     main_store: store::Main, |     main_store: store::Main, | ||||||
|     postings_lists_store: store::PostingsLists, |     postings_lists_store: store::PostingsLists, | ||||||
|     documents_fields_counts_store: store::DocumentsFieldsCounts, |  | ||||||
| ) -> MResult<Vec<RawDocument>> { | ) -> MResult<Vec<RawDocument>> { | ||||||
|     let mut matches = Vec::new(); |     let mut matches = Vec::new(); | ||||||
|     let mut highlights = Vec::new(); |     let mut highlights = Vec::new(); | ||||||
|  |  | ||||||
|  |     let before_automatons_groups_loop = Instant::now(); | ||||||
|     for group in automatons_groups { |     for group in automatons_groups { | ||||||
|         let AutomatonGroup { |         let AutomatonGroup { is_phrase_query, automatons } = group; | ||||||
|             is_phrase_query, |  | ||||||
|             automatons, |  | ||||||
|         } = group; |  | ||||||
|         let phrase_query_len = automatons.len(); |         let phrase_query_len = automatons.len(); | ||||||
|  |  | ||||||
|         let mut tmp_matches = Vec::new(); |         let mut tmp_matches = Vec::new(); | ||||||
|         for (id, automaton) in automatons.into_iter().enumerate() { |         for (id, automaton) in automatons.into_iter().enumerate() { | ||||||
|             let Automaton { |             let Automaton { index, is_exact, query_len, query, .. } = automaton; | ||||||
|                 index, |  | ||||||
|                 is_exact, |  | ||||||
|                 query_len, |  | ||||||
|                 query, |  | ||||||
|                 .. |  | ||||||
|             } = automaton; |  | ||||||
|             let dfa = automaton.dfa(); |             let dfa = automaton.dfa(); | ||||||
|  |  | ||||||
|             let words = match main_store.words_fst(reader)? { |             let words = match main_store.words_fst(reader)? { | ||||||
| @@ -250,26 +242,26 @@ fn fetch_raw_documents( | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |     debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); | ||||||
|  |  | ||||||
|  |     let before_multiword_rewrite_matches = Instant::now(); | ||||||
|     let matches = multiword_rewrite_matches(matches, &query_enhancer); |     let matches = multiword_rewrite_matches(matches, &query_enhancer); | ||||||
|  |     debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed()); | ||||||
|  |  | ||||||
|  |     let before_highlight_sorting = Instant::now(); | ||||||
|     let highlights = { |     let highlights = { | ||||||
|         highlights.sort_unstable_by_key(|(id, _)| *id); |         highlights.sort_unstable_by_key(|(id, _)| *id); | ||||||
|         SetBuf::new_unchecked(highlights) |         SetBuf::new_unchecked(highlights) | ||||||
|     }; |     }; | ||||||
|  |     debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed()); | ||||||
|  |  | ||||||
|     let fields_counts = { |  | ||||||
|         let mut fields_counts = Vec::new(); |  | ||||||
|         for group in matches.linear_group_by_key(|(id, ..)| *id) { |  | ||||||
|             let id = group[0].0; |  | ||||||
|             for result in documents_fields_counts_store.document_fields_counts(reader, id)? { |  | ||||||
|                 let (attr, count) = result?; |  | ||||||
|                 fields_counts.push((id, attr, count)); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         SetBuf::new(fields_counts).unwrap() |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     Ok(raw_documents_from(matches, highlights, fields_counts)) |     let before_raw_documents = Instant::now(); | ||||||
|  |     let raw_documents = raw_documents_from(matches, highlights); | ||||||
|  |     debug!("raw_documents took {:.02?}", before_raw_documents.elapsed()); | ||||||
|  |     debug!("documents to worry about: {}", raw_documents.len()); | ||||||
|  |  | ||||||
|  |     Ok(raw_documents) | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { | ||||||
| @@ -434,6 +426,11 @@ where | |||||||
|     for auts in automaton_producer { |     for auts in automaton_producer { | ||||||
|         automatons.push(auts); |         automatons.push(auts); | ||||||
|  |  | ||||||
|  |         for (i, group) in automatons.iter().enumerate() { | ||||||
|  |             debug!("group {} automatons {:?}", i, group.automatons); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let before_fetch_raw_documents = Instant::now(); | ||||||
|         // we must retrieve the documents associated |         // we must retrieve the documents associated | ||||||
|         // with the current automatons |         // with the current automatons | ||||||
|         let mut raw_documents = fetch_raw_documents( |         let mut raw_documents = fetch_raw_documents( | ||||||
| @@ -443,8 +440,8 @@ where | |||||||
|             searchable_attrs.as_ref(), |             searchable_attrs.as_ref(), | ||||||
|             main_store, |             main_store, | ||||||
|             postings_lists_store, |             postings_lists_store, | ||||||
|             documents_fields_counts_store, |  | ||||||
|         )?; |         )?; | ||||||
|  |         debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed()); | ||||||
|  |  | ||||||
|         // stop processing when time is running out |         // stop processing when time is running out | ||||||
|         if let Some(timeout) = timeout { |         if let Some(timeout) = timeout { | ||||||
| @@ -468,6 +465,20 @@ where | |||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|  |                 // we must pull the fields counts of these documents | ||||||
|  |                 // TODO it would be great to had a "dependency" thing for each criterion | ||||||
|  |                 //      and make it so that we can be lazy on pulling/computing some data. | ||||||
|  |                 if criterion.name() == "Exact" { | ||||||
|  |                     for document in group.iter_mut() { | ||||||
|  |                         let mut fields_counts = Vec::new(); | ||||||
|  |                         for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? { | ||||||
|  |                             let (attr, count) = result?; | ||||||
|  |                             fields_counts.push(AttrCount { attr: attr.0, count }); | ||||||
|  |                         } | ||||||
|  |                         document.fields_counts = Some(SetBuf::new(fields_counts).unwrap()); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|                 group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); |                 group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); | ||||||
|  |  | ||||||
|                 for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { |                 for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { | ||||||
| @@ -561,7 +572,6 @@ where | |||||||
|             searchable_attrs.as_ref(), |             searchable_attrs.as_ref(), | ||||||
|             main_store, |             main_store, | ||||||
|             postings_lists_store, |             postings_lists_store, | ||||||
|             documents_fields_counts_store, |  | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         // stop processing when time is running out |         // stop processing when time is running out | ||||||
|   | |||||||
| @@ -1,18 +1,18 @@ | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
|  |  | ||||||
| use meilisearch_schema::SchemaAttr; |  | ||||||
| use sdset::SetBuf; | use sdset::SetBuf; | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
|  | use log::debug; | ||||||
|  |  | ||||||
| use crate::{DocumentId, Highlight, TmpMatch}; | use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; | ||||||
|  |  | ||||||
| #[derive(Clone)] | #[derive(Clone)] | ||||||
| pub struct RawDocument { | pub struct RawDocument { | ||||||
|     pub id: DocumentId, |     pub id: DocumentId, | ||||||
|     pub matches: SharedMatches, |     pub matches: SharedMatches, | ||||||
|     pub highlights: Vec<Highlight>, |     pub highlights: Vec<Highlight>, | ||||||
|     pub fields_counts: SetBuf<(SchemaAttr, u16)>, |     pub fields_counts: Option<SetBuf<AttrCount>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl RawDocument { | impl RawDocument { | ||||||
| @@ -100,44 +100,47 @@ impl fmt::Debug for RawDocument { | |||||||
|  |  | ||||||
| pub fn raw_documents_from( | pub fn raw_documents_from( | ||||||
|     matches: SetBuf<(DocumentId, TmpMatch)>, |     matches: SetBuf<(DocumentId, TmpMatch)>, | ||||||
|     highlights: SetBuf<(DocumentId, Highlight)>, |     highlights: SetBuf<(DocumentId, Highlight)> | ||||||
|     fields_counts: SetBuf<(DocumentId, SchemaAttr, u16)>, |  | ||||||
| ) -> Vec<RawDocument> { | ) -> Vec<RawDocument> { | ||||||
|     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); |     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); | ||||||
|     let mut matches2 = Matches::with_capacity(matches.len()); |     let mut matches2 = Matches::with_capacity(matches.len()); | ||||||
|  |  | ||||||
|     let matches = matches.linear_group_by_key(|(id, _)| *id); |     let matches = matches.linear_group_by_key(|(id, _)| *id); | ||||||
|     let highlights = highlights.linear_group_by_key(|(id, _)| *id); |     let highlights = highlights.linear_group_by_key(|(id, _)| *id); | ||||||
|     let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id); |  | ||||||
|  |  | ||||||
|     for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) { |     let mut loops_count = 0; | ||||||
|         debug_assert_eq!(mgroup[0].0, hgroup[0].0); |  | ||||||
|         debug_assert_eq!(mgroup[0].0, fgroup[0].0); |     for (mgroup, hgroup) in matches.zip(highlights) { | ||||||
|  |         loops_count += 1; | ||||||
|  |         assert_eq!(mgroup[0].0, hgroup[0].0); | ||||||
|  |  | ||||||
|         let document_id = mgroup[0].0; |         let document_id = mgroup[0].0; | ||||||
|         let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); |         let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); | ||||||
|         let end = start + mgroup.len(); |         let end = start + mgroup.len(); | ||||||
|         let highlights = hgroup.iter().map(|(_, h)| *h).collect(); |         let highlights = hgroup.iter().map(|(_, h)| *h).collect(); | ||||||
|         let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap(); |         let fields_counts = None; | ||||||
|  |  | ||||||
|         docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); |         docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); | ||||||
|  |         // TODO we could try to keep both data | ||||||
|  |         //  - the data oriented one and the raw one, | ||||||
|  |         //  - the one that comes from the arguments of this function | ||||||
|  |         // This way we would be able to only produce data oriented lazily. | ||||||
|  |         // | ||||||
|  |         // For example the default first criterion is `SumOfTypos` | ||||||
|  |         // and just needs the `query_index` and the `distance` fields. | ||||||
|  |         // It would probably be good to avoid wasting time sorting other fields of documents | ||||||
|  |         // that will never ever reach the second criterion. | ||||||
|         matches2.extend_from_slice(mgroup); |         matches2.extend_from_slice(mgroup); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     debug!("loops_counts number is {}", loops_count); | ||||||
|  |  | ||||||
|     let matches = Arc::new(matches2); |     let matches = Arc::new(matches2); | ||||||
|     docs_ranges |     docs_ranges | ||||||
|         .into_iter() |         .into_iter() | ||||||
|         .map(|(id, range, highlights, fields_counts)| { |         .map(|(id, range, highlights, fields_counts)| { | ||||||
|             let matches = SharedMatches { |             let matches = SharedMatches { range, matches: matches.clone() }; | ||||||
|                 range, |             RawDocument { id, matches, highlights, fields_counts } | ||||||
|                 matches: matches.clone(), |  | ||||||
|             }; |  | ||||||
|             RawDocument { |  | ||||||
|                 id, |  | ||||||
|                 matches, |  | ||||||
|                 highlights, |  | ||||||
|                 fields_counts, |  | ||||||
|             } |  | ||||||
|         }) |         }) | ||||||
|         .collect() |         .collect() | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user